rm(list=ls())
library(ezids)
library(tidyverse)
library(janitor)
library(scales)
library(ggrepel)
library(corrplot)
library(tigris)
library(sf)
library(ggridges)
library(scales)
library(tidycensus)
library(here)
library(car)
library(caTools)
library(glmnet)
library(randomForest)
library(rpart)
library(rpart.plot)
library(rattle)
library(kableExtra)
library(ModelMetrics)

Introduction

Conclusion

project 2

Data

This project uses the cbsa-level 2024 Realtor.com dataset referenced as the source data from our Project 1 data source: the American Enterprise Institute. We enrich this dataset with 1-year 2024 data from the American Community Survey. The variables are:

Original Variable Name New Variable Name Definition
month_date_yyyymm month_date_yyyymm Month/Year date in Realtor.com dataset
cbsa_code cbsa_code Core-Based Statistical Area (CBSA) code
cbsa_title cbsa_title CBSA name/title
median_listing_price_per_square_foot median_listing_price_per_square_foot Median listing price per square foot
total_listing_count total_listing_count Total number of listings in the CBSA/month
pending_ratio pending_ratio Share of listings pending sale
median_days_on_market median_days_on_market Median number of days listings are on the market
log_median_price_sqft log_median_price_sqft Log of median listing price per square foot
DP05_0001E total_population Total population (ACS)
DP05_0003PE pct_female Percent female
DP05_0037PE pct_white Percent White alone
DP05_0045PE pct_black Percent Black alone
DP05_0053PE pct_aian Percent American Indian/Alaska Native
DP05_0061PE pct_asian Percent Asian alone
DP05_0069PE pct_nhpi Percent Native Hawaiian/Pacific Islander
DP05_0075PE pct_two_or_more Percent two or more races
DP05_0090PE pct_hispanic Percent Hispanic or Latino
DP05_0018E median_age Median age
DP05_0019PE pct_under_18 Percent under 18 years old
DP02_0001E total_households Total households
DP02_0002PE pct_married_couple Percent married-couple households
DP02_0006PE pct_male_householder Percent male householder, no spouse
DP02_0010PE pct_female_householder Percent female householder, no spouse
DP03_0005PE pct_unemployed Percent unemployed
DP03_0119PE pct_below_poverty Percent below poverty level
DP03_0053PE pct_income_10_14999 Percent income $10k–$14,999
DP03_0054PE pct_income_15_24999 Percent income $15k–$24,999
DP03_0055PE pct_income_25_34999 Percent income $25k–$34,999
DP03_0056PE pct_income_35_49999 Percent income $35k–$49,999
DP03_0057PE pct_income_50_74999 Percent income $50k–$74,999
DP03_0058PE pct_income_75_99999 Percent income $75k–$99,999
DP03_0059PE pct_income_100_149k Percent income $100k–$149k
DP03_0060PE pct_income_150_199k Percent income $150k–$199k
DP03_0061PE pct_income_200k_plus Percent income $200k+
DP03_0062E median_household_income Median household income
DP02_0067PE pct_high_school_plus Percent high school or higher
DP02_0068PE pct_bachelors_plus Percent bachelor’s degree or higher
DP04_0001E total_housing_units Total housing units
DP04_0003PE pct_vacant_units Percent vacant housing units
DP04_0046PE pct_owner_occupied Percent owner-occupied housing units
DP04_0017PE pct_built_2020_plus Percent built 2020+
DP04_0018PE pct_built_2010_2019 Percent built 2010–2019
DP04_0019PE pct_built_2000_2009 Percent built 2000–2009
DP04_0020PE pct_built_1990_1999 Percent built 1990–1999
DP04_0021PE pct_built_1980_1989 Percent built 1980–1989
DP04_0022PE pct_built_1970_1979 Percent built 1970–1979
DP04_0023PE pct_built_1960_1969 Percent built 1960–1969
DP04_0024PE pct_built_1950_1959 Percent built 1950–1959
DP04_0025PE pct_built_1940_1949 Percent built 1940–1949
DP04_0040PE pct_1_bed Percent 1-bedroom units
DP04_0041PE pct_2_bed Percent 2-bedroom units
DP04_0042PE pct_3_bed Percent 3-bedroom units
DP04_0043PE pct_4_bed Percent 4-bedroom units
DP04_0044PE pct_5plus_bed Percent 5+ bedroom units

Data Sources

American Community Survey

#ACS variable codes 
#use to make a dictionary for the report

#demographics

#DP05_0001E - Total Population

#DP05_0003PE – % Female
#DP05_0037PE – % White alone
#DP05_0045PE – % Black or African American alone
#DP05_0053PE - % American Indian and Alaska Native alone
#DP05_0061PE – % Asian alone
#DP05_0069PE - % Native Hawaiian and Other Pacific Islander alone
#DP05_0075PE - % Two or More Races

#DP05_0090PE – % Hispanic or Latino

#DP05_0018E - Median age (years)
#DP05_0019PE - % Under 18 years

#households

#DP02_0001E – Total households
#DP02_0002PE – % Married-couple households
#DP02_0006PE – % Male householder, no spouse
#DP02_0010PE – % Female householder, no spouse

#economics

#DP03_0005PE – % Unemployed
#DP03_0119PE – % Below poverty level

#DP03_0052PE - % Household income <$10,000 (ELIMINATE FOR MULTICOLLINEARITY)
#DP03_0053PE - % Household income $10,000 to $14,999
#DP03_0054PE - % Household income $15,000 to $24,999
#DP03_0055PE - % Household income $25,000 to $34,999
#DP03_0056PE - % Household income $35,000 to $49,999
#DP03_0057PE - % Household income $50,000 to $74,999
#DP03_0058PE – % Household income $75,000 to $99,999
#DP03_0059PE – % Household income $100,000 to $149,999
#DP03_0060PE – % Household income $150,000 to $199,999
#DP03_0061PE – % Household income $200,000 or more

#DP03_0062E - Median household income (dollars)

#DP02_0067PE - % High school graduate or higher
#DP02_0068PE – % Bachelor’s degree or higher


#housing

#DP04_0001E – Total housing units
#DP04_0003PE – % Vacant units

#DP04_0046PE – % of occupied that are Owner-occupied

#DP04_0017PE - % Units built 2020 or later
#DP04_0018PE - % Units built 2010 to 2019
#DP04_0019PE – % Units built 2000 to 2009
#DP04_0020PE – % Units built 1990 to 1999
#DP04_0021PE – % Units built 1980 to 1989
#DP04_0022PE – % Units built 1970 to 1979
#DP04_0023PE – % Units built 1960 to 1969
#DP04_0024PE – % Units built 1950 to 1959
#DP04_0025PE – % Units built 1940 to 1949
#DP04_0026PE – % Units built 1939 or earlier (ELIMINATE FOR MULTICOLLINEARITY)

#DP04_0039PE - % 0 Bedrooms (ELIMINATE FOR MULTICOLLINEARITY)
#DP04_0040PE - % 1 Bedrooms
#DP04_0041PE - % 2 Bedrooms
#DP04_0042PE - % 3 Bedrooms
#DP04_0043PE - % 4 Bedrooms
#DP04_0044PE - % 5 Bedrooms or more
census_api_key("29e7dfea2f8b253a0a10ccd9626f78e49f4f0a4f")

# census_vars = load_variables(year = 2024, dataset = "acs1/profile", cache = TRUE)
# census_vars_limited = grepv("^(?!DP02PR).*P$", census_vars$name, perl=TRUE)
# acs1_2024 = get_acs(geography = "metropolitan statistical area/micropolitan statistical area",
#                     variables = census_vars_limited,
#                     year = 2024,
#                     survey = "acs1",
#                     output = "wide")
# head(acs1_2024)
vars_needed <- c("NAME",
                 "GEOID",
                 "DP05_0001E", 
                 "DP05_0003PE", 
                 "DP05_0037PE", 
                 "DP05_0045PE", 
                 "DP05_0053PE", 
                 "DP05_0061PE", 
                 "DP05_0069PE", 
                 "DP05_0075PE", 
                 "DP05_0090PE", 
                 "DP05_0018E", 
                 "DP05_0019PE", 
                 "DP02_0001E", 
                 "DP02_0002PE", 
                 "DP02_0006PE", 
                 "DP02_0010PE", 
                 "DP03_0005PE", 
                 "DP03_0119PE", 
                 # "DP03_0052PE", 
                 "DP03_0053PE", 
                 "DP03_0054PE", 
                 "DP03_0055PE", 
                 "DP03_0056PE", 
                 "DP03_0057PE", 
                 "DP03_0058PE", 
                 "DP03_0059PE",
                 "DP03_0060PE", 
                 "DP03_0061PE", 
                 "DP03_0062E",
                 "DP02_0067PE", 
                 "DP02_0068PE",
                 "DP04_0001E", 
                 "DP04_0003PE", 
                 "DP04_0046PE",
                 "DP04_0017PE", 
                 "DP04_0018PE",
                 "DP04_0019PE", 
                 "DP04_0020PE",
                 "DP04_0021PE", 
                 "DP04_0022PE", 
                 "DP04_0023PE", 
                 "DP04_0024PE",
                 "DP04_0025PE", 
                 # "DP04_0026PE",
                 # "DP04_0039PE", 
                 "DP04_0040PE", 
                 "DP04_0041PE",
                 "DP04_0042PE", 
                 "DP04_0043PE", 
                 "DP04_0044PE")

acs1_2024_orig = get_acs(geography = "metropolitan statistical area/micropolitan statistical area",
                         variables = vars_needed,
                         year = 2024,
                         survey = "acs1",
                         output = "wide")

head(acs1_2024_orig)
## # A tibble: 6 × 94
##   GEOID NAME           DP05_0001E DP05_0001M DP05_0003PE DP05_0003PM DP05_0037PE
##   <chr> <chr>               <dbl>      <dbl>       <dbl>       <dbl>       <dbl>
## 1 10140 Aberdeen, WA …      77893         NA        49.5         0.8        75.5
## 2 10180 Abilene, TX M…     181969       3181        48.2         0.7        66.7
## 3 10300 Adrian, MI Mi…      97746         NA        50.2         0.6        85.6
## 4 10380 Aguadilla, PR…     250969         NA        51.4         1          52.1
## 5 10420 Akron, OH Met…     702209         NA        51.5         0.2        76  
## 6 10460 Alamogordo, N…      69711         NA        47           1.6        53.9
## # ℹ 87 more variables: DP05_0037PM <dbl>, DP05_0045PE <dbl>, DP05_0045PM <dbl>,
## #   DP05_0053PE <dbl>, DP05_0053PM <dbl>, DP05_0061PE <dbl>, DP05_0061PM <dbl>,
## #   DP05_0069PE <dbl>, DP05_0069PM <dbl>, DP05_0075PE <dbl>, DP05_0075PM <dbl>,
## #   DP05_0090PE <dbl>, DP05_0090PM <dbl>, DP05_0018E <dbl>, DP05_0018M <dbl>,
## #   DP05_0019PE <dbl>, DP05_0019PM <dbl>, DP02_0001E <dbl>, DP02_0001M <dbl>,
## #   DP02_0002PE <dbl>, DP02_0002PM <dbl>, DP02_0006PE <dbl>, DP02_0006PM <dbl>,
## #   DP02_0010PE <dbl>, DP02_0010PM <dbl>, DP03_0005PE <dbl>, …
acs1_2024_no_M = select(acs1_2024_orig, -grepv("M$", names(acs1_2024_orig)))
head(acs1_2024_no_M)
## # A tibble: 6 × 48
##   GEOID NAME          DP05_0001E DP05_0003PE DP05_0037PE DP05_0045PE DP05_0053PE
##   <chr> <chr>              <dbl>       <dbl>       <dbl>       <dbl>       <dbl>
## 1 10140 Aberdeen, WA…      77893        49.5        75.5         1.3         4.2
## 2 10180 Abilene, TX …     181969        48.2        66.7         8.5         0.4
## 3 10300 Adrian, MI M…      97746        50.2        85.6         2.4         0.2
## 4 10380 Aguadilla, P…     250969        51.4        52.1         1.2         0  
## 5 10420 Akron, OH Me…     702209        51.5        76          12.3         0.2
## 6 10460 Alamogordo, …      69711        47          53.9         4.4         6.5
## # ℹ 41 more variables: DP05_0061PE <dbl>, DP05_0069PE <dbl>, DP05_0075PE <dbl>,
## #   DP05_0090PE <dbl>, DP05_0018E <dbl>, DP05_0019PE <dbl>, DP02_0001E <dbl>,
## #   DP02_0002PE <dbl>, DP02_0006PE <dbl>, DP02_0010PE <dbl>, DP03_0005PE <dbl>,
## #   DP03_0119PE <dbl>, DP03_0053PE <dbl>, DP03_0054PE <dbl>, DP03_0055PE <dbl>,
## #   DP03_0056PE <dbl>, DP03_0057PE <dbl>, DP03_0058PE <dbl>, DP03_0059PE <dbl>,
## #   DP03_0060PE <dbl>, DP03_0061PE <dbl>, DP03_0062E <dbl>, DP02_0067PE <dbl>,
## #   DP02_0068PE <dbl>, DP04_0001E <dbl>, DP04_0003PE <dbl>, …
#This includes both percents as well as margins of error. I (Alex) have included code to drop the margins of error.
#rename ACS variable codes to real things
acs1_2024 <- acs1_2024_no_M %>%
  rename(total_population = DP05_0001E,
    pct_female = DP05_0003PE,
    pct_white = DP05_0037PE,
    pct_black = DP05_0045PE,
    pct_aian = DP05_0053PE,
    pct_asian = DP05_0061PE,
    pct_nhpi = DP05_0069PE,
    pct_two_or_more = DP05_0075PE,
    pct_hispanic = DP05_0090PE,
    median_age = DP05_0018E,
    pct_under_18 = DP05_0019PE,
    total_households = DP02_0001E,
    pct_married_couple = DP02_0002PE,
    pct_male_householder = DP02_0006PE,
    pct_female_householder = DP02_0010PE,
    pct_unemployed = DP03_0005PE,
    pct_below_poverty = DP03_0119PE,
    # pct_income_lt10k = DP03_0052PE,
    pct_income_10_14999 = DP03_0053PE,
    pct_income_15_24999 = DP03_0054PE,
    pct_income_25_34999 = DP03_0055PE,
    pct_income_35_49999 = DP03_0056PE,
    pct_income_50_74999 = DP03_0057PE,
    pct_income_75_99999 = DP03_0058PE,
    pct_income_100_149k = DP03_0059PE,
    pct_income_150_199k = DP03_0060PE,
    pct_income_200k_plus = DP03_0061PE,
    median_household_income = DP03_0062E,
    pct_high_school_plus = DP02_0067PE,
    pct_bachelors_plus = DP02_0068PE,
    total_housing_units = DP04_0001E,
    pct_vacant_units = DP04_0003PE,
    pct_owner_occupied = DP04_0046PE,
    pct_built_2020_plus = DP04_0017PE,
    pct_built_2010_2019 = DP04_0018PE,
    pct_built_2000_2009 = DP04_0019PE,
    pct_built_1990_1999 = DP04_0020PE,
    pct_built_1980_1989 = DP04_0021PE,
    pct_built_1970_1979 = DP04_0022PE,
    pct_built_1960_1969 = DP04_0023PE,
    pct_built_1950_1959 = DP04_0024PE,
    pct_built_1940_1949 = DP04_0025PE,
    # pct_built_1939_earlier = DP04_0026PE,
    # pct_0_bed = DP04_0039PE,
    pct_1_bed = DP04_0040PE,
    pct_2_bed = DP04_0041PE,
    pct_3_bed = DP04_0042PE,
    pct_4_bed = DP04_0043PE,
    pct_5plus_bed = DP04_0044PE)

# #check it out
head(acs1_2024)
## # A tibble: 6 × 48
##   GEOID NAME  total_population pct_female pct_white pct_black pct_aian pct_asian
##   <chr> <chr>            <dbl>      <dbl>     <dbl>     <dbl>    <dbl>     <dbl>
## 1 10140 Aber…            77893       49.5      75.5       1.3      4.2       1.6
## 2 10180 Abil…           181969       48.2      66.7       8.5      0.4       2  
## 3 10300 Adri…            97746       50.2      85.6       2.4      0.2       0.3
## 4 10380 Agua…           250969       51.4      52.1       1.2      0         0  
## 5 10420 Akro…           702209       51.5      76        12.3      0.2       4.3
## 6 10460 Alam…            69711       47        53.9       4.4      6.5       1.5
## # ℹ 40 more variables: pct_nhpi <dbl>, pct_two_or_more <dbl>,
## #   pct_hispanic <dbl>, median_age <dbl>, pct_under_18 <dbl>,
## #   total_households <dbl>, pct_married_couple <dbl>,
## #   pct_male_householder <dbl>, pct_female_householder <dbl>,
## #   pct_unemployed <dbl>, pct_below_poverty <dbl>, pct_income_10_14999 <dbl>,
## #   pct_income_15_24999 <dbl>, pct_income_25_34999 <dbl>,
## #   pct_income_35_49999 <dbl>, pct_income_50_74999 <dbl>, …
#I want to make a area type var here before merging 
#create area_type var
acs1_2024 <- acs1_2024 %>%
  mutate(area_type = ifelse(grepl("Micro Area",NAME), "Micro", "Metro"))

unique(acs1_2024$area_type)
## [1] "Micro" "Metro"
#ilgaz' correlation stuff
# acs_clean <- acs1_2024_pcts %>% drop_na()
# num_df <- acs_clean %>% 
#   select(where(is.numeric))
# num_df$dummy_target <- rnorm(nrow(num_df))
#model <- lm(dummy_target ~ ., data = num_df)
#check_collinearity(model)

# head(acs_clean)
# num_df <- acs1_2024_pcts %>% select(where(is.numeric))
# 
# corr_mat <- cor(num_df, use = "pairwise.complete.obs")
# threshold <- 0.80
# 
# high_corr <- which(abs(corr_mat) > threshold,
#                    arr.ind = TRUE)
# 
# # convert to readable table
# high_corr_pairs <- data.frame(
#   var1 = rownames(corr_mat)[high_corr[,1]],
#   var2 = colnames(corr_mat)[high_corr[,2]],
#   corr = corr_mat[high_corr]
# )
# 
# # remove duplicates (since matrix is symmetric)
# high_corr_pairs <- high_corr_pairs[high_corr_pairs$var1 < high_corr_pairs$var2, ]
# 
# 
# high_corr_pairs # 2794 out of 147,696 pairs (around 2%) have an absolute corr >= .80
# 
# #steph's sidequest
# sq_acs <- acs1_2024_pcts
# 
# #numeric-only subset
# sq_acs_num <- sq_acs %>%
#   select(where(is.numeric))
# 
# #ID'ing fake percents and set to na
# sq_acs_num <- sq_acs_num %>%
#   mutate(across(everything(),
#                 ~ ifelse(. > 100, NA, .)))
# 
# #dropping all na columns
# sq_acs_num <- sq_acs_num %>%
#   select(where(~ !all(is.na(.))))
# 
# #converting NA values to median
# sq_acs_num <- sq_acs_num %>%
#   mutate(across(everything(),
#                 ~ ifelse(is.na(.),
#                          median(., na.rm = TRUE),
#                          .)))
# 
# #dropping constant or near-constant cols
# sq_acs_num <- sq_acs_num %>%
#   select(where(function(col) {
#     s <- sd(col, na.rm = TRUE)
#     if (is.na(s)) s <- 0
#     s > 10
#   }))
# 
# #dummy target
# set.seed(123)
# sq_acs_num$fake <- rnorm(nrow(sq_acs_num))
# 
# #fitting dummy linear model
# sq_acs_mod <- lm(fake ~ ., data = sq_acs_num)
# 
# #computing vif
# compute_vif <- function(df){
#   vars <- colnames(df)
#   out <- tibble(variable = vars, vif = NA_real_)
#   
#   for(i in seq_along(vars)){
#     y <- df[[i]]
#     X <- df[, -i, drop = FALSE]
#     
# #fit model for this variable alone
#   mod <- tryCatch(lm(y ~ ., data = as.data.frame(X)),
#                     error = function(e) NULL)
#     
#     if(!is.null(mod)){
#       out$vif[i] <- tryCatch(
#         max(car::vif(mod)),
#         error = function(e) NA_real_
#       )
#     }
#   }
#   
#   out
# }
# 
# vif_results <- compute_vif(sq_acs_num)
# 
# #sort highest VIF at top
# vif_results_sorted <- vif_results %>% arrange(desc(vif))
# 
# vif_results_sorted






#team, should we just delete this chunk?- BW
#which variables to keep


# #filter down the dataset
# acs1_2024_filtered <- acs1_2024_pcts[, c("GEOID", "NAME", vars_needed)]
# 
# #rename ACS variable codes to real things
# acs1_2024_filtered <- acs1_2024_filtered %>%
#   rename(total_population = DP05_0001E,
#     pct_female = DP05_0003PE,
#     pct_white = DP05_0037PE,
#     pct_black = DP05_0045PE,
#     pct_aian = DP05_0053PE,
#     pct_asian = DP05_0061PE,
#     pct_nhpi = DP05_0069PE,
#     pct_two_or_more = DP05_0075PE,
#     pct_hispanic = DP05_0090PE,
#     median_age = DP05_0018E,
#     pct_under_18 = DP05_0019PE,
#     total_households = DP02_0001E,
#     pct_married_couple = DP02_0002PE,
#     pct_male_householder = DP02_0006PE,
#     pct_female_householder = DP02_0010PE,
#     pct_unemployed = DP03_0005PE,
#     pct_below_poverty = DP03_0119PE,
#     pct_income_lt10k = DP03_0052PE,
#     pct_income_10_14999 = DP03_0053PE,
#     pct_income_15_24999 = DP03_0054PE,
#     pct_income_25_34999 = DP03_0055PE,
#     pct_income_35_49999 = DP03_0056PE,
#     pct_income_50_74999 = DP03_0057PE,
#     pct_income_75_99999 = DP03_0058PE,
#     pct_income_100_149k = DP03_0059PE,
#     pct_income_150_199k = DP03_0060PE,
#     pct_income_200k_plus = DP03_0061PE,
#     median_household_income = DP03_0062E,
#     pct_high_school_plus = DP02_0067PE,
#     pct_bachelors_plus = DP02_0068PE,
#     total_housing_units = DP04_0001E,
#     pct_vacant_units = DP04_0003PE,
#     pct_owner_occupied = DP04_0046PE,
#     pct_built_2020_plus = DP04_0017PE,
#     pct_built_2010_2019 = DP04_0018PE,
#     pct_built_2000_2009 = DP04_0019PE,
#     pct_built_1990_1999 = DP04_0020PE,
#     pct_built_1980_1989 = DP04_0021PE,
#     pct_built_1970_1979 = DP04_0022PE,
#     pct_built_1960_1969 = DP04_0023PE,
#     pct_built_1950_1959 = DP04_0024PE,
#     pct_built_1940_1949 = DP04_0025PE,
#     pct_built_1939_earlier = DP04_0026PE,
#     pct_0_bed = DP04_0039PE,
#     pct_1_bed = DP04_0040PE,
#     pct_2_bed = DP04_0041PE,
#     pct_3_bed = DP04_0042PE,
#     pct_4_bed = DP04_0043PE,
#     pct_5plus_bed = DP04_0044PE)
# 
# # #check it out
# head(acs1_2024_filtered)
# 
# # remove a few collinear / unusable vars
# # acs1_2024_filtered <- acs1_2024_filtered %>%
# #   select(-pct_male,
# #          -pct_hispanic,
# #          -pct_65_over,
# #          -pct_renter_occupied)
# 
# #I want to make a area type var here before merging 
# #create area_type var
# acs1_2024_filtered <- acs1_2024_filtered %>%
#   mutate(area_type = ifelse(grepl("Micro Area",NAME), "Micro", "Metro"))
# 
# unique(acs1_2024_filtered$area_type)


#team, I think we should delete this chunk too. -BW

Realtor.com

#loading in inventory data
inventory_raw <- read_csv(here("../RDC_Inventory_Core_Metrics_Metro_History.csv"))

#cleaning names
clean_names(inventory_raw)
## # A tibble: 103,600 × 47
##    month_date_yyyymm cbsa_code cbsa_title    household_rank median_listing_price
##                <dbl>     <dbl> <chr>                  <dbl>                <dbl>
##  1            202510     35620 New York-New…              1               762450
##  2            202510     31080 Los Angeles-…              2              1099000
##  3            202510     16980 Chicago-Nape…              3               364900
##  4            202510     19100 Dallas-Fort …              4               425000
##  5            202510     26420 Houston-Pasa…              5               358000
##  6            202510     37980 Philadelphia…              6               379973
##  7            202510     33100 Miami-Fort L…              7               499999
##  8            202510     12060 Atlanta-Sand…              8               415000
##  9            202510     47900 Washington-A…              9               594500
## 10            202510     38060 Phoenix-Mesa…             10               495000
## # ℹ 103,590 more rows
## # ℹ 42 more variables: median_listing_price_mm <dbl>,
## #   median_listing_price_yy <dbl>, active_listing_count <dbl>,
## #   active_listing_count_mm <dbl>, active_listing_count_yy <dbl>,
## #   median_days_on_market <dbl>, median_days_on_market_mm <dbl>,
## #   median_days_on_market_yy <dbl>, new_listing_count <dbl>,
## #   new_listing_count_mm <dbl>, new_listing_count_yy <dbl>, …
#hunting down year
glimpse(inventory_raw)
## Rows: 103,600
## Columns: 47
## $ month_date_yyyymm                       <dbl> 202510, 202510, 202510, 202510…
## $ cbsa_code                               <dbl> 35620, 31080, 16980, 19100, 26…
## $ cbsa_title                              <chr> "New York-Newark-Jersey City, …
## $ HouseholdRank                           <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,…
## $ median_listing_price                    <dbl> 762450, 1099000, 364900, 42500…
## $ median_listing_price_mm                 <dbl> 0.0039, 0.0000, -0.0200, 0.000…
## $ median_listing_price_yy                 <dbl> -0.0160, -0.0443, -0.0138, -0.…
## $ active_listing_count                    <dbl> 36156, 18409, 16751, 29448, 33…
## $ active_listing_count_mm                 <dbl> 0.0074, -0.0470, -0.0048, -0.0…
## $ active_listing_count_yy                 <dbl> 0.0388, 0.1787, -0.0175, 0.136…
## $ median_days_on_market                   <dbl> 54, 57, 38, 64, 57, 43, 86, 60…
## $ median_days_on_market_mm                <dbl> -0.0609, 0.0000, 0.0133, 0.015…
## $ median_days_on_market_yy                <dbl> 0.0093, 0.1875, 0.0270, 0.1327…
## $ new_listing_count                       <dbl> 13832, 7832, 8644, 8794, 9428,…
## $ new_listing_count_mm                    <dbl> -0.1703, -0.0046, -0.1044, -0.…
## $ new_listing_count_yy                    <dbl> 0.0716, 0.0158, -0.0476, -0.04…
## $ price_increased_count                   <dbl> 362, 330, 160, 448, 1304, 266,…
## $ price_increased_count_mm                <dbl> 0.0838, 0.1074, -0.2308, -0.05…
## $ price_increased_count_yy                <dbl> -0.1084, 0.0123, -0.3043, -0.2…
## $ price_increased_share                   <dbl> 0.0046, 0.0100, 0.0043, 0.0094…
## $ price_increased_share_mm                <dbl> 0.0002, 0.0015, -0.0011, -0.00…
## $ price_increased_share_yy                <dbl> -0.0016, -0.0006, -0.0019, -0.…
## $ price_reduced_count                     <dbl> 7062, 5260, 6308, 13318, 10622…
## $ price_reduced_count_mm                  <dbl> 0.0282, -0.0187, 0.0003, -0.00…
## $ price_reduced_count_yy                  <dbl> 0.2059, 0.2233, 0.0805, 0.1428…
## $ price_reduced_share                     <dbl> 0.0964, 0.1547, 0.1704, 0.2803…
## $ price_reduced_share_mm                  <dbl> 0.0054, 0.0023, 0.0067, 0.0068…
## $ price_reduced_share_yy                  <dbl> 0.0064, 0.0146, 0.0124, 0.0206…
## $ pending_listing_count                   <dbl> 23266, 7260, 11372, 9225, 9594…
## $ pending_listing_count_mm                <dbl> -0.0015, -0.0285, -0.0487, -0.…
## $ pending_listing_count_yy                <dbl> 0.3075, -0.0204, 0.0441, -0.05…
## $ median_listing_price_per_square_foot    <dbl> 515, 662, 211, 202, 172, 230, …
## $ median_listing_price_per_square_foot_mm <dbl> 0.0174, -0.0046, -0.0089, -0.0…
## $ median_listing_price_per_square_foot_yy <dbl> -0.0346, -0.0232, 0.0000, -0.0…
## $ median_square_feet                      <dbl> 1498, 1747, 1727, 2166, 2097, …
## $ median_square_feet_mm                   <dbl> -0.0013, 0.0023, -0.0089, -0.0…
## $ median_square_feet_yy                   <dbl> -0.0360, -0.0199, -0.0265, 0.0…
## $ average_listing_price                   <dbl> 1580160, 2274862, 526075, 6210…
## $ average_listing_price_mm                <dbl> 0.0214, 0.0052, -0.0146, 0.010…
## $ average_listing_price_yy                <dbl> -0.0468, -0.0408, -0.0298, 0.0…
## $ total_listing_count                     <dbl> 58998, 25629, 28103, 38647, 43…
## $ total_listing_count_mm                  <dbl> 0.0050, -0.0432, -0.0209, -0.0…
## $ total_listing_count_yy                  <dbl> 0.1307, 0.1152, 0.0081, 0.0828…
## $ pending_ratio                           <dbl> 0.643, 0.394, 0.679, 0.313, 0.…
## $ pending_ratio_mm                        <dbl> -0.0057, 0.0075, -0.0314, -0.0…
## $ pending_ratio_yy                        <dbl> 0.1323, -0.0802, 0.0400, -0.06…
## $ quality_flag                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
#filtering for 2024
inventory_2024 <- inventory_raw %>%
  filter(month_date_yyyymm >= 202401 & month_date_yyyymm <= 202412)

#filtered for target vars
inventory_2024_filtered <- inventory_2024%>%
  select(
    month_date_yyyymm,
    cbsa_code,
    cbsa_title,
    median_listing_price_per_square_foot,
    total_listing_count,
    pending_ratio,
    median_days_on_market)%>%
  mutate(
    log_median_price_sqft = log(median_listing_price_per_square_foot)) %>%
  drop_na()

summary(inventory_2024_filtered)
##  month_date_yyyymm   cbsa_code      cbsa_title       
##  Min.   :202401    Min.   :10100   Length:11052      
##  1st Qu.:202404    1st Qu.:20100   Class :character  
##  Median :202407    Median :29860   Mode  :character  
##  Mean   :202406    Mean   :29857                     
##  3rd Qu.:202410    3rd Qu.:39700                     
##  Max.   :202412    Max.   :49820                     
##  median_listing_price_per_square_foot total_listing_count pending_ratio  
##  Min.   :  48                         Min.   :   17       Min.   :0.002  
##  1st Qu.: 134                         1st Qu.:  136       1st Qu.:0.318  
##  Median : 169                         Median :  284       Median :0.490  
##  Mean   : 202                         Mean   : 1255       Mean   :0.553  
##  3rd Qu.: 224                         3rd Qu.:  776       3rd Qu.:0.719  
##  Max.   :1708                         Max.   :52936       Max.   :2.679  
##  median_days_on_market log_median_price_sqft
##  Min.   :  9.0         Min.   :3.87         
##  1st Qu.: 46.0         1st Qu.:4.90         
##  Median : 58.0         Median :5.13         
##  Mean   : 60.1         Mean   :5.19         
##  3rd Qu.: 72.0         3rd Qu.:5.41         
##  Max.   :164.0         Max.   :7.44
# inventory_2024_grouped <- inventory_2024_filtered %>%
#   select(!month_date_yyyymm) %>%
#   group_by(cbsa_code, cbsa_title)%>%
#   summarise(
#     med_median_listing_price_per_square_foot = median(median_listing_price_per_square_foot),
#     # avg_log_median_listing_price = mean(log_median_listing_price, na.rm = TRUE),
#     med_total_listing_count = median(total_listing_count),
#     med_pending_ratio = median(pending_ratio),
#     med_median_days_on_market = median(median_days_on_market)) %>%
#   ungroup()
# 
# print(inventory_2024_grouped)

The Realtor.com inventory dataset provides monthly metro-level housing indicators like listing counts, median prices, and days on market. Unlike the ACS, it doesn’t include paired estimates or margins of error. Each variable is already given as a direct point estimate. Some fields (the _mm and _yy columns) represent month-over-month or year-over-year percent changes, which introduce redundancy, so we later drop those before modeling. Aside from that, the data only need to be filtered to the 2024 months and are ready for use.

#cbsa code var to chr so merge works
inventory_2024_filtered <- inventory_2024_filtered %>%
  mutate(cbsa_code = as.character(cbsa_code))

#merge ACS and inventory datasets
full_dataset <- acs1_2024 %>%
  left_join(inventory_2024_filtered, by = c("GEOID" = "cbsa_code"))

#which vars to factor
factor_vars <- c("GEOID",
                 # "NAME",
                 "cbsa_title",
                 # "HouseholdRank",
                 # "quality_flag",
                 "area_type")

#drop redundant columns
redundant_vars <- c("NAME")

#clean dataset
full_dataset_clean <- full_dataset %>% 
  select(-all_of(redundant_vars)) %>% # Drop redundant columns
  mutate(across(all_of(factor_vars), as.factor)) #%>% # Convert factor variables
  # mutate(log_median_listing_price = log(median_listing_price)) # Log home price

#NAs by variable type
na_summary_one <- full_dataset_clean %>%
  summarise(across(everything(), ~ sum(is.na(.)))) %>%
  pivot_longer(everything(), names_to = "variable", values_to = "NA_count") %>%
  mutate(var_type = ifelse(variable %in% factor_vars, "factor", "numeric")) %>%
  arrange(desc(NA_count))

print(na_summary_one)
## # A tibble: 55 × 3
##    variable               NA_count var_type
##    <chr>                     <int> <chr>   
##  1 pct_white                    60 numeric 
##  2 pct_black                    60 numeric 
##  3 pct_aian                     60 numeric 
##  4 pct_asian                    60 numeric 
##  5 pct_nhpi                     60 numeric 
##  6 pct_two_or_more              60 numeric 
##  7 total_households              6 numeric 
##  8 pct_married_couple            6 numeric 
##  9 pct_male_householder          6 numeric 
## 10 pct_female_householder        6 numeric 
## # ℹ 45 more rows
#address NAs (drop hispanic, when organized by FIPS code, nearest neighbor for ages, replace remaining NAs with 0)
#drop hispanic
#commenting out for now since some of this was done earlier and would like to convene before moving/removing
# full_dataset_clean <- full_dataset_clean %>%
#   select(-pct_hispanic)
# 
# #when organized by FIPS code, nearest neighbor for ages
# age_vars <- c("pct_under_18", "pct_65_over")
# 
# full_dataset_clean <- full_dataset_clean %>%
#   arrange(GEOID, month_date_yyyymm) %>%
#   group_by(GEOID) %>%
#   mutate(across(all_of(age_vars), 
#                 ~ if(sum(!is.na(.)) >= 2) {
#                     approx(x = seq_along(.), 
#                            y = ., 
#                            xout = seq_along(.), 
#                            method = "linear", 
#                            rule = 2)$y
#                   } else {
#                     .  # leave as-is (NA)
#                   })) %>%
#   ungroup()
# 
# na_counts_two <- full_dataset_clean %>%
#   summarise(across(everything(), ~ sum(is.na(.)))) %>%
#   pivot_longer(everything(), names_to = "variable", values_to = "na_count") %>%
#   arrange(desc(na_count))
# 
# na_counts_two
#remove remaining NAs
full_dataset_clean <- full_dataset_clean %>%
  drop_na()
#got rid of 86 observations, but we still have over 6k
nrow(full_dataset_clean)
## [1] 6280

After merging the data to create our holistic dataset, we only dropped 86 rows to remove NA values. This left us with 6,280 observations remaining- more than enough to run a reliable analysis. To prepare for our analysis, we started by creating a summary table of key metrics around some of our key variables.

summary_long <- full_dataset_clean %>%
  summarise(median_price = median(median_listing_price_per_square_foot, na.rm = TRUE),
    mean_price   = mean(median_listing_price_per_square_foot, na.rm = TRUE),
    sd_price     = sd(median_listing_price_per_square_foot, na.rm = TRUE),
    median_supply = median(total_listing_count, na.rm = TRUE),
    mean_supply   = mean(total_listing_count, na.rm = TRUE),
    sd_supply     = sd(total_listing_count, na.rm = TRUE),
    median_income = median(median_household_income, na.rm = TRUE),
    mean_income   = mean(median_household_income, na.rm = TRUE),
    sd_income     = sd(median_household_income, na.rm = TRUE),
    median_pending_ratio = median(pending_ratio, na.rm = TRUE),
    mean_pending_ratio   = mean(pending_ratio, na.rm = TRUE),
    sd_pending_ratio     = sd(pending_ratio, na.rm = TRUE)) %>%
  pivot_longer(everything(), names_to = "Metric", values_to = "Value")

summary_table_kable <- kable(summary_long, caption = "Summary Statistics") %>%
  kable_styling(full_width = F, bootstrap_options = c("striped", "hover"))

summary_table_kable
Summary Statistics
Metric Value
median_price 187.000
mean_price 219.229
sd_price 129.567
median_supply 626.000
mean_supply 2089.634
sd_supply 4826.752
median_income 71632.000
mean_income 73933.927
sd_income 15025.725
median_pending_ratio 0.538
mean_pending_ratio 0.599
sd_pending_ratio 0.336

Even before graphing and investigating our dataset further with exploratory data analysis (EDA), the summary statistics tell us that all of our key variables have a right-skew. For price, income, and pending ratio, this is a slight right-skew, likely as a result of a few, large outliers. However, for our supply variable, there is an extreme right-skew and a very large standard deviation. This is likely due to the structure of the variable instead of a fundamental error in the dataset, and the supply distribution plot in the EDA section will add further context to this variable summary.

Exploratory Data Analysis

As this is a new dataset to our team, we needed to start our project with some high-level exploratory data analysis. This data analysis helped us frame our approach to our statistical analysis and predictive models later in the project. For a few of our variables, we graphed both the raw variable distributions and also logged versions of these variables to ensure that we can capture larger relationships independent of influential outliers.

price_dist_plt= ggplot(inventory_2024_filtered, aes(x = median_listing_price_per_square_foot)) +
  geom_histogram(binwidth = 50, fill = "steelblue", color = "black") +
  labs(
    title = "Distribution of Median Listing Price per Square Foot",
    subtitle = "Median Price per Square Foot is heavily right-skewed",
    x = "Median Price per Square Foot",
    y = "Count",
    caption = "SOURCE: Realtor.com") +
  scale_x_continuous(labels= dollar_format(accuracy=1))+
  theme_minimal(base_size = 11)+
  theme(plot.title=element_text(face="bold",size=14,margin=ggplot2::margin(b=3)),
  plot.subtitle = element_text(size=10,color="gray25",margin=ggplot2::margin(b=8)),
  plot.caption  = element_text(size=8,color="gray40",hjust=0),
  axis.title    = element_text(size=10,face="bold"),
  axis.text     = element_text(size=9),
  panel.grid.minor = element_blank(),
  panel.background  = element_blank(),
  plot.background   = element_blank())

ggsave(filename = "price_distribution.png",   
  plot = price_dist_plt,                 
  width = 8,                             
  height = 5,                            
  dpi = 300)

price_dist_plt

The first distribution we plotted was median price per square foot- our main dependent variable. The graph shows that a majority of median listing prices per square foot fall below $500 with a handful of very high values pulling the mean of the graph higher than the median.

logged_price_dist= ggplot(inventory_2024_filtered, aes(x = log_median_price_sqft)) +
  geom_histogram(binwidth = 0.1, fill = "steelblue", color = "black") +
  labs(title = "Distribution of Log Median Listing Price per Square Foot",
    subtitle = "Log transformation reduces right skew and reveals the underlying distribution",
    x = "Log Median Price per Square Foot",
    y = "Count",
    caption = "SOURCE: Realtor.com") +
  scale_x_continuous(
    labels = function(x) dollar(exp(x), accuracy = 1)) +
  theme_minimal(base_size = 11) +
  theme(plot.title    = element_text(face = "bold", size = 14, margin = ggplot2::margin(b = 3)),
    plot.subtitle = element_text(size = 10, color = "gray25", margin = ggplot2::margin(b = 8)),
    plot.caption  = element_text(size = 8, color = "gray40", hjust = 0),
    axis.title    = element_text(size = 10, face = "bold"),
    axis.text     = element_text(size = 9),
    panel.grid.minor = element_blank(),
    panel.background  = element_blank(),
    plot.background   = element_blank())

ggsave(filename = "logged_price_distribution.png",
  plot = logged_price_dist,
  width = 8,
  height = 5,
  dpi = 300)

logged_price_dist

Plotting the logged median price per square foot shows a distribution much closer to the normal curve. Logging the price mitigated the heavy influence of of outlier values and creates a more robust source from which to draw relationship insights between variables.

housing_violion<- ggplot(inventory_2024_filtered, aes(x = "", y = total_listing_count)) +
  geom_violin(fill = "lightblue", color = "steelblue", alpha = 0.7) +
  geom_jitter(width = 0.1, alpha = 0.3, color = "darkblue") +
  labs(title = "Distribution of Total Listing Count",
    subtitle = "Before transformation, the distribution is heavily centered at the lower price range",
    x = "",
    y = "Total Listings",
    caption = "SOURCE: Realtor.com") +
  theme_minimal(base_size = 11) +
  theme(plot.title    = element_text(face="bold", size=14, margin=ggplot2::margin(b=3)),
    plot.subtitle = element_text(size=10, color="gray25", margin=ggplot2::margin(b=8)),
    plot.caption  = element_text(size=8, color="gray40", hjust=0),
    axis.title    = element_text(size=10, face="bold"),
    axis.text     = element_text(size=9),
    panel.grid.minor = element_blank(),
    panel.background  = element_blank(),
    plot.background   = element_blank())

housing_violion

This graph attempts to draw further insights into the distribution of listing counts among our cbsa units, but the concentration falling so heavily near the bottom of the y-axis makes it difficult to specifically interpret the supply distribution. To address this, we also plotted the log of supply below.

log_housing_violin_plt= ggplot(inventory_2024_filtered, aes(x = "", y = total_listing_count)) +
  geom_violin(fill = "lightblue", color = "steelblue", alpha = 0.7) +
  geom_jitter(width = 0.1, alpha = 0.3, color = "darkblue") +
  scale_y_log10() +
  labs(title = "Distribution of Total Listing Count (Log Scale)",
    subtitle = "The distribution density of total listing count is more easily visualized\nwith a log transformation and violin plot",
    x = "",
    y = "Log of Total Listings",
    caption = "SOURCE: Realtor.com") +
  theme_minimal(base_size = 11) +
  theme(
    plot.title    = element_text(face="bold", size=14, margin=ggplot2::margin(b=3)),
    plot.subtitle = element_text(size=10, color="gray25", margin=ggplot2::margin(b=8)),
    plot.caption  = element_text(size=8, color="gray40", hjust=0),
    axis.title    = element_text(size=10, face="bold"),
    axis.text     = element_text(size=9),
    panel.grid.minor = element_blank(),
    panel.background  = element_blank(),
    plot.background   = element_blank())

ggsave(filename = "logged_supply_violin.png",   
  plot = log_housing_violin_plt,                 
  width = 8,                             
  height = 5,                            
  dpi = 300)
log_housing_violin_plt

This graph shows the log distribution of listing supply overlayed on top of a violin plot of the same data. This added dimension and logged variable adds much more insight into the distribution of the supply variable. As the graph demonstrates, a majority of the supply variables land near the bottom of the y-axis, indicating that there are vastly more cbsa with low listing counts than high listing counts. This is likely due to the structure of our cbsa data. Micro areas can be cbsa regions with 10,000-50,000 residents, and macro areas are any over 50,000 residents. Most small towns fall in the micro level, whereas some massive metropolitan areas (like New York, Chicago, Los Angeles) have millions of residents. This leads to a vast majority of the units falling on the lower end of the spectrum with large, meaninful outliers near the top. For further graphs, we also plotted the log of supply to help normalize this distribution and draw out the true nature of the relationships between variables.

price_supply_scatter= ggplot(inventory_2024_filtered, aes(x = total_listing_count, y = median_listing_price_per_square_foot)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = FALSE, color = "darkred", linewidth = 1) +
  labs(title = "Price vs Total Supply",
       subtitle = "Median price per square foot tends to increase as total listings increase",
       x = "Total Listing Count",
       y = "Median Price per Square Foot",
       caption = "SOURCE: Realtor.com") +
  scale_y_continuous(labels = dollar_format(accuracy = 1)) +
  theme_minimal(base_size = 11) +
  theme(plot.title    = element_text(face="bold", size=14, margin=ggplot2::margin(b=3)),
        plot.subtitle = element_text(size=10, color="gray25", margin=ggplot2::margin(b=8)),
        plot.caption  = element_text(size=8, color="gray40", hjust=0),
        axis.title    = element_text(size=10, face="bold"),
        axis.text     = element_text(size=9),
        panel.grid.minor = element_blank(),
        panel.background  = element_blank(),
        plot.background   = element_blank())

ggsave(filename = "pricesupply_scatter.png",   
  plot = price_supply_scatter,                 
  width = 8,                             
  height = 5,                            
  dpi = 300)

price_supply_scatter

This graph plots the relationship between the un-transformed price and supply variables as this is the key relationship driving our SMART question. This plot shows how price appears to be positively correlated with total listing count, but we procedded with log transformations of the variables to be sure.

log_price_supply_scatter <- ggplot(inventory_2024_filtered, aes(x = total_listing_count, y = log_median_price_sqft)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = FALSE, color = "darkred", linewidth = 1) +
  labs(title = "Log Price vs Total Supply",
       subtitle = "Log transformation reduces skew and reveals variation across listings",
       x = "Total Listing Count",
       y = "Median Price per Square Foot (log scale)",
       caption = "SOURCE: Realtor.com") +
  scale_y_continuous(labels = function(x) dollar(exp(x), accuracy = 1)) +
  theme_minimal(base_size = 11) +
  theme(plot.title    = element_text(face="bold", size=14, margin=ggplot2::margin(b=3)),
        plot.subtitle = element_text(size=10, color="gray25", margin=ggplot2::margin(b=8)),
        plot.caption  = element_text(size=8, color="gray40", hjust=0),
        axis.title    = element_text(size=10, face="bold"),
        axis.text     = element_text(size=9),
        panel.grid.minor = element_blank(),
        panel.background  = element_blank(),
        plot.background   = element_blank())

log_price_supply_scatter

After plotting the lof of our price variable against our supply variable, the positive relationship between price and supply appears even stronger than in our previous graph. However, the distribution of supply is still heavily skewed, so we also logged that variable for the next graph.

#log by log scatter with a regression line on top and the actual values associated with the log transformation
log_price_log_supply_scatter<- ggplot(inventory_2024_filtered,
       aes(x = log(total_listing_count),
           y = log(median_listing_price_per_square_foot))) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = FALSE, color = "darkred", linewidth = 1) +
  scale_x_continuous(name = "Total Listing Count (log scale)",
    labels = function(x) scales::comma(round(exp(x)))) +
  scale_y_continuous(
    name = "Median Price per Square Foot (log scale)",
    labels = function(y) scales::dollar(round(exp(y), 0))) +
  labs(title = "Log Price vs Log Total Supply",
    subtitle = "Higher listing counts are generally associated with higher listing prices",
    caption = "SOURCE: Realtor.com") +
  theme_minimal(base_size = 11) +
  theme(plot.title    = element_text(face="bold", size=14, margin = ggplot2::margin(b=3)),
    plot.subtitle = element_text(size=10, color="gray25", margin = ggplot2::margin(b=8)),
    plot.caption  = element_text(size=8, color="gray40", hjust=0),
    axis.title    = element_text(size=10, face="bold"),
    axis.text     = element_text(size=9),
    panel.grid.minor = element_blank(),
    panel.background  = element_blank(),
    plot.background   = element_blank())

log_price_log_supply_scatter

In this final version of our price/ supply scatterplot, we compare the log of price with the log of supply, and we see a clear positive trend between price and supply. Log of median price per square foot steadily increases as log of total listing count increases, supporting the idea that changes in housing supply- at least partially- influence changes in housing price.

logprice_logsupply_scatter= inventory_2024_filtered %>%
  mutate(listing_bin = ntile(total_listing_count, 5)) %>%
  ggplot(aes(x = factor(listing_bin), y = log_median_price_sqft)) +
  geom_boxplot(fill = "lightblue", color = "steelblue") +
  scale_y_continuous(
    name = "Median Price per Square Foot (log scale)",
    labels = function(y) scales::dollar(round(exp(y), 0))) +
  labs(
    title = "Price Distribution by Supply Quantiles",
    subtitle= "Higher median log price is associated with higher supply",
    x = "Supply Quantile (Total Listings)",
    y = "Median Price per Sq Ft (log)",
    caption = "SOURCE: Realtor.com") +
  theme_minimal(base_size = 11) +
  theme(
    plot.title    = element_text(face="bold", size=14, margin = ggplot2::margin(b=3)),
    plot.subtitle = element_text(size=10, color="gray25", margin = ggplot2::margin(b=8)),
    plot.caption  = element_text(size=8, color="gray40", hjust=0),
    axis.title    = element_text(size=10, face="bold"),
    axis.text     = element_text(size=9))

ggsave(filename = "logpricelogsupply_scatter.png",   
  plot = logprice_logsupply_scatter,                 
  width = 8,                             
  height = 5,                            
  dpi = 300)

logprice_logsupply_scatter

When separating unlogged supply into quantiles for a boxplot, the positive correlation between these two variables remains plainly visible.

Price_demand_scatter= ggplot(inventory_2024_filtered, aes(x = pending_ratio, y = median_listing_price_per_square_foot)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = FALSE, color = "darkred", linewidth = 1) +
  scale_y_continuous(labels = dollar_format(accuracy = 1)) +
  labs(title = "Price vs Market Tightness (Demand)",
    subtitle = "Higher market demand is associated with lower prices",
    x = "Pending Ratio (Pending / Active Listings)",
    y = "Median Price per Square Foot",
    caption = "SOURCE: Realtor.com") +
  theme_minimal(base_size = 11) +
  theme(plot.title    = element_text(face="bold", size=14, margin = ggplot2::margin(b=3)),
    plot.subtitle = element_text(size=10, color="gray25", margin = ggplot2::margin(b=8)),
    plot.caption  = element_text(size=8, color="gray40", hjust=0),
    axis.title    = element_text(size=10, face="bold"),
    axis.text     = element_text(size=9),
    panel.grid.minor = element_blank(),
    panel.background  = element_blank(),
    plot.background   = element_blank())

ggsave(filename = "pricedemand_scatter.png",   
  plot = Price_demand_scatter,                 
  width = 8,                             
  height = 5,                            
  dpi = 300)

Price_demand_scatter

The next relationship we investigated was the relationship between housing price and market demand. As is clean in the graph above, plotting median listing price per square foot against pending ratio (our proxy for market demand), reveals a slight negative relationship between price and demand. This is in line with traditional models of supply and demand where where supply is held constant, and higher demand is associated with lower market prices.

logprice_demand_scatter= ggplot(inventory_2024_filtered, aes(x = pending_ratio, y = log_median_price_sqft)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = FALSE, color = "darkred", linewidth = 1) +
  scale_y_continuous(
    labels = function(x) dollar(exp(x), accuracy = 1)) +
  labs(title = "Log Price vs Market Tightness (Demand)",
    subtitle = "Higher market demand is associated with lower log prices",
    x = "Pending Ratio (Pending / Active Listings)",
    y = "Median Price per Square Foot (log scale)",
    caption = "SOURCE: Realtor.com") +
  theme_minimal(base_size = 11) +
  theme(plot.title    = element_text(face="bold", size=14, margin = ggplot2::margin(b=3)),
    plot.subtitle = element_text(size=10, color="gray25", margin = ggplot2::margin(b=8)),
    plot.caption  = element_text(size=8, color="gray40", hjust=0),
    axis.title    = element_text(size=10, face="bold"),
    axis.text     = element_text(size=9),
    panel.grid.minor = element_blank(),
    panel.background  = element_blank(),
    plot.background   = element_blank())

ggsave(filename = "logpricedemand_scatter.png",   
  plot = logprice_demand_scatter,                 
  width = 8,                             
  height = 5,                            
  dpi = 300)

logprice_demand_scatter

We also graphed the relationship between log price and market demand, and the negative correlation between these relationships persists.

price_market_hotness_plot <-ggplot(inventory_2024_filtered, aes(x = median_days_on_market, y = median_listing_price_per_square_foot)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = FALSE, color = "darkred", linewidth = 1) +
  scale_y_continuous(labels = dollar_format(accuracy = 1)) +
  labs(title = "Price vs Market Speed",
    subtitle = "The relationship between price and market speed is minimal",
    x = "Median Days on Market",
    y = "Median Price per Square Foot",
    caption = "SOURCE: Realtor.com") +
  theme_minimal(base_size = 11) +
  theme(plot.title    = element_text(face="bold", size=14, margin = ggplot2::margin(b=3)),
    plot.subtitle = element_text(size=10, color="gray25", margin = ggplot2::margin(b=8)),
    plot.caption  = element_text(size=8, color="gray40", hjust=0),
    axis.title    = element_text(size=10, face="bold"),
    axis.text     = element_text(size=9),
    panel.grid.minor = element_blank(),
    panel.background  = element_blank(),
    plot.background   = element_blank())

price_market_hotness_plot

The next set of variables that we investigated in our exploratory data analysis was price and market speed. The graph above shows a slight positive relationship between price and market speed (represented with the median days on market variable), although the magnitude of the relationship appears small. Additionally, we cannot claim causality in this relationship as the risk of reverse causality is very high between these variables. We would require a much more thorough study to investigate any causal relationship between these variables.

log_price_market_hotness_plot<- ggplot(inventory_2024_filtered, aes(x = median_days_on_market, y = log_median_price_sqft)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = FALSE, color = "darkred", linewidth = 1) +
  scale_y_continuous(labels = function(x) dollar(exp(x), accuracy = 1)) +
  labs(title = "Log Price vs Market Speed",
    subtitle = "The relationship between log price and market speed is also minimal",
    x = "Median Days on Market",
    y = "Median Price per Square Foot (log scale)",
    caption = "SOURCE: Realtor.com") +
  theme_minimal(base_size = 11) +
  theme(plot.title    = element_text(face="bold", size=14, margin = ggplot2::margin(b=3)),
    plot.subtitle = element_text(size=10, color="gray25", margin = ggplot2::margin(b=8)),
    plot.caption  = element_text(size=8, color="gray40", hjust=0),
    axis.title    = element_text(size=10, face="bold"),
    axis.text     = element_text(size=9),
    panel.grid.minor = element_blank(),
    panel.background  = element_blank(),
    plot.background   = element_blank())

log_price_market_hotness_plot

When we graphed the log price against the market speed, the direction of the correlation changed. Instead of having a slightly negative relationship, there now appears to be a very very slight positive relationship between price and market speed. This graph simply adds complexity to the relationship between these two variables- if there is a noteworthy relationship between them at all.

price_income_scatter= ggplot(full_dataset_clean, aes(x = median_household_income, 
                               y = median_listing_price_per_square_foot)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "darkred", linewidth = 1) +
  scale_x_continuous(labels = scales::dollar) +
  scale_y_continuous(labels = scales::dollar) +
  labs(title = "Median Price vs Median Household Income",
    subtitle = "There appears to be a strong positive relationship between median\nlisting price and median household income",
    x = "Median Household Income",
    y = "Median Price per Sq Ft",
    caption = "SOURCE: Realtor.com & ACS 2024") +
  theme_minimal(base_size = 11) +
  theme(plot.title    = element_text(face="bold", size=14, margin=ggplot2::margin(b=3)),
    plot.subtitle = element_text(size=10, color="gray25", margin=ggplot2::margin(b=8)),
    plot.caption  = element_text(size=8, color="gray40", hjust=0),
    axis.title    = element_text(size=10, face="bold"),
    axis.text     = element_text(size=9))

ggsave(filename = "priceincome_scatter.png",   
  plot = price_income_scatter,                 
  width = 8,                             
  height = 5,                            
  dpi = 300)

price_income_scatter

The final set of variables we analyzed was median price per square foot and median household income. We expected a strong, positive relationship between these variables, and as is plainly visible in the graph above, that is exactly what we found. Again, this is in line with classic models of supply and demand. However, neither of these variables have normal distributions, so we also compared their log transformations.

logprice_logincome_scatter= ggplot(full_dataset_clean) +
  geom_point(aes(x = log(median_household_income), 
                 y = log(median_listing_price_per_square_foot)),
             alpha = 0.6, color = "steelblue") +
  geom_smooth(aes(x = log(median_household_income), 
                  y = log(median_listing_price_per_square_foot)),
              method = "lm", se = TRUE, color = "darkred", linewidth = 1) +
  scale_x_continuous(name = "Median Household Income (log scale)",
    labels = function(x) scales::dollar(exp(x))) +
  scale_y_continuous(name = "Median Price per Sq Ft (log scale)",
    labels = function(y) scales::dollar(exp(y))) +
  labs(title = "Log Price vs Log Median Household Income",
    subtitle = "Standardizing both variables only emphasizes the relationship",
    caption = "SOURCE: Realtor.com & ACS 2024") +
  theme_minimal(base_size = 11) +
  theme(plot.title    = element_text(face="bold", size=14, margin=ggplot2::margin(b=3)),
    plot.subtitle = element_text(size=10, color="gray25", margin=ggplot2::margin(b=8)),
    plot.caption  = element_text(size=8, color="gray40", hjust=0),
    axis.title    = element_text(size=10, face="bold"),
    axis.text     = element_text(size=9))

ggsave(filename = "logpricelogincome_scatter.png",   
  plot = logprice_logincome_scatter,                 
  width = 8,                             
  height = 5,                            
  dpi = 300)

logprice_logincome_scatter

When plotting log price and log income, the positive relationship between the variables is of an even larger magnitude than the un-transformed versions of the variables. This demonstrates a clear and strong relationship and the potential significance of income as a predictor in our models.

Modeling

EDA cannot establish causality by itself, but it does provide a strong starting point for more complex statistical testing of the data.

Split the Data into train and test

set.seed(4)

targets = c("median_listing_price_per_square_foot", "log_median_price_sqft")

split = sample.split(full_dataset_clean$GEOID, SplitRatio = 0.8)
train = subset(full_dataset_clean %>% select(-"GEOID"), split == TRUE)
test = subset(full_dataset_clean %>% select(-"GEOID"), split == FALSE)

# train
price_train = train$median_listing_price_per_square_foot
log_price_train = train$log_median_price_sqft

X_train = train %>% select(-all_of(targets))
X_train_matrix = model.matrix(~.-1,X_train)

X_train_no_cbsa = X_train %>% select(-cbsa_title)
X_train_no_cbsa_matrix = model.matrix(~.-1,X_train_no_cbsa)

# test
price_test = test$median_listing_price_per_square_foot
log_price_test = test$log_median_price_sqft

X_test = test %>% select(-all_of(targets))
X_test_matrix = model.matrix(~.-1,X_test)

X_test_no_cbsa = X_test %>% select(-cbsa_title)
X_test_no_cbsa_matrix = model.matrix(~.-1,X_test_no_cbsa)

Linear Regression

non-log

# names(full_dataset_clean)
#reg d- "avg_log_median_listing_price" i- "total_active_listing_count" 
reg_model_by_1 <- lm(median_listing_price_per_square_foot ~ total_listing_count,
            data = train)

summary(reg_model_by_1)
## 
## Call:
## lm(formula = median_listing_price_per_square_foot ~ total_listing_count, 
##     data = train)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -204.8  -68.2  -30.2   23.0 1065.0 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         209.640034   1.921284   109.1   <2e-16 ***
## total_listing_count   0.004556   0.000366    12.4   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 128 on 5231 degrees of freedom
## Multiple R-squared:  0.0288, Adjusted R-squared:  0.0286 
## F-statistic:  155 on 1 and 5231 DF,  p-value: <2e-16
#reg d- "avg_log_median_listing_price" i- "total_active_listing_count" "area_type" "avg_pct_income_75k_plus" "avg_pct_units_built_2000_plus" "avg_pct_bachelors_or_higher" 
reg_model_by_all <- lm(median_listing_price_per_square_foot ~ .-cbsa_title-log_median_price_sqft,
            data = train)

summary(reg_model_by_all)
## 
## Call:
## lm(formula = median_listing_price_per_square_foot ~ . - cbsa_title - 
##     log_median_price_sqft, data = train)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -219.7  -32.1   -2.1   26.2  556.6 
## 
## Coefficients:
##                              Estimate    Std. Error t value Pr(>|t|)    
## (Intercept)             -8845.9248917 50189.5044941   -0.18  0.86010    
## total_population            0.0000709     0.0000119    5.98  2.4e-09 ***
## pct_female                  2.4375101     0.9224434    2.64  0.00826 ** 
## pct_white                  -3.3909536     0.4911432   -6.90  5.7e-12 ***
## pct_black                  -3.8852373     0.5029785   -7.72  1.3e-14 ***
## pct_aian                   -3.5247886     0.5352760   -6.58  5.0e-11 ***
## pct_asian                  -2.8157779     0.6672297   -4.22  2.5e-05 ***
## pct_nhpi                   22.4770681     1.6719943   13.44  < 2e-16 ***
## pct_two_or_more            -4.0802497     0.3878300  -10.52  < 2e-16 ***
## pct_hispanic                0.4671647     0.2961200    1.58  0.11471    
## median_age                  2.4508771     0.4051798    6.05  1.6e-09 ***
## pct_under_18               -4.1705049     0.6532789   -6.38  1.9e-10 ***
## total_households           -0.0009366     0.0001044   -8.97  < 2e-16 ***
## pct_married_couple         -4.4202959     0.6194868   -7.14  1.1e-12 ***
## pct_male_householder       -8.5572505     0.7520751  -11.38  < 2e-16 ***
## pct_female_householder     -3.7096113     0.7323602   -5.07  4.2e-07 ***
## pct_unemployed             -7.5248864     1.2340013   -6.10  1.2e-09 ***
## pct_below_poverty          -0.8776696     0.5437292   -1.61  0.10655    
## pct_income_10_14999         1.2672191     1.1189377    1.13  0.25747    
## pct_income_15_24999         6.6273718     1.0104158    6.56  5.9e-11 ***
## pct_income_25_34999         3.5914555     1.0064759    3.57  0.00036 ***
## pct_income_35_49999         4.5561182     0.8579653    5.31  1.1e-07 ***
## pct_income_50_74999         3.6882698     0.8404993    4.39  1.2e-05 ***
## pct_income_75_99999         0.0894807     1.0469556    0.09  0.93189    
## pct_income_100_149k         4.7500064     1.0177460    4.67  3.1e-06 ***
## pct_income_150_199k         1.9035372     1.1177579    1.70  0.08863 .  
## pct_income_200k_plus        8.3107327     1.2237206    6.79  1.2e-11 ***
## median_household_income     0.0042429     0.0003796   11.18  < 2e-16 ***
## pct_high_school_plus       -0.0673494     0.4530865   -0.15  0.88184    
## pct_bachelors_plus         -0.6412146     0.2524776   -2.54  0.01112 *  
## total_housing_units         0.0007263     0.0000966    7.52  6.4e-14 ***
## pct_vacant_units            3.8821105     0.2255146   17.21  < 2e-16 ***
## pct_owner_occupied         -3.9518493     0.2559362  -15.44  < 2e-16 ***
## pct_built_2020_plus         1.7264530     0.6248400    2.76  0.00575 ** 
## pct_built_2010_2019         0.6467341     0.3729643    1.73  0.08297 .  
## pct_built_2000_2009         1.7104181     0.3833736    4.46  8.3e-06 ***
## pct_built_1990_1999         5.4986155     0.3622425   15.18  < 2e-16 ***
## pct_built_1980_1989         0.1075156     0.3409086    0.32  0.75249    
## pct_built_1970_1979         4.4634395     0.3294739   13.55  < 2e-16 ***
## pct_built_1960_1969         3.2606558     0.4481913    7.28  4.0e-13 ***
## pct_built_1950_1959         1.4099698     0.4856007    2.90  0.00371 ** 
## pct_built_1940_1949         1.2691494     0.7324796    1.73  0.08321 .  
## pct_1_bed                  -2.9346380     0.9385550   -3.13  0.00178 ** 
## pct_2_bed                  -6.0047784     0.7778107   -7.72  1.4e-14 ***
## pct_3_bed                  -5.7643616     0.7913148   -7.28  3.7e-13 ***
## pct_4_bed                 -10.1503354     0.8250897  -12.30  < 2e-16 ***
## pct_5plus_bed              -4.8878482     0.8743573   -5.59  2.4e-08 ***
## area_typeMicro             20.8463748     2.4677002    8.45  < 2e-16 ***
## month_date_yyyymm           0.0479433     0.2479423    0.19  0.84668    
## total_listing_count        -0.0058566     0.0005451  -10.74  < 2e-16 ***
## pending_ratio             -12.2510452     3.1841257   -3.85  0.00012 ***
## median_days_on_market       0.3178351     0.0642984    4.94  7.9e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 59.7 on 5181 degrees of freedom
## Multiple R-squared:  0.789,  Adjusted R-squared:  0.787 
## F-statistic:  380 on 51 and 5181 DF,  p-value: <2e-16

Have to eliminate cbsa_title or we would overfit

log median listing price

# names(full_dataset_clean)
#reg d- "avg_log_median_listing_price" i- "total_active_listing_count" 
reg_model_log_by_1 <- lm(log_median_price_sqft ~ total_listing_count,
            data = train)

summary(reg_model_log_by_1)
## 
## Call:
## lm(formula = log_median_price_sqft ~ total_listing_count, data = train)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -1.169 -0.274 -0.046  0.202  1.893 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         5.24253933 0.00624529   839.4   <2e-16 ***
## total_listing_count 0.00001945 0.00000119    16.4   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.414 on 5231 degrees of freedom
## Multiple R-squared:  0.0486, Adjusted R-squared:  0.0485 
## F-statistic:  267 on 1 and 5231 DF,  p-value: <2e-16

Nearly double the multiple R-squared.

#reg d- "avg_log_median_listing_price" i- "total_active_listing_count" "area_type" "avg_pct_income_75k_plus" "avg_pct_units_built_2000_plus" "avg_pct_bachelors_or_higher" 
reg_model_log_by_all <- lm(log_median_price_sqft ~ . -cbsa_title -median_listing_price_per_square_foot,
            data = train)

summary(reg_model_log_by_all)
## 
## Call:
## lm(formula = log_median_price_sqft ~ . - cbsa_title - median_listing_price_per_square_foot, 
##     data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.6310 -0.1124 -0.0033  0.1109  0.8750 
## 
## Coefficients:
##                                Estimate      Std. Error t value Pr(>|t|)    
## (Intercept)             -263.9506517811  147.1815778591   -1.79  0.07297 .  
## total_population           0.0000000435    0.0000000348    1.25  0.21117    
## pct_female                 0.0216150690    0.0027050809    7.99  1.6e-15 ***
## pct_white                 -0.0183854352    0.0014402858  -12.77  < 2e-16 ***
## pct_black                 -0.0220597433    0.0014749931  -14.96  < 2e-16 ***
## pct_aian                  -0.0154160110    0.0015697061   -9.82  < 2e-16 ***
## pct_asian                 -0.0197734811    0.0019566624  -10.11  < 2e-16 ***
## pct_nhpi                   0.0209086486    0.0049031519    4.26  2.0e-05 ***
## pct_two_or_more           -0.0152579070    0.0011373182  -13.42  < 2e-16 ***
## pct_hispanic              -0.0017765348    0.0008683769   -2.05  0.04083 *  
## median_age                 0.0083844151    0.0011881966    7.06  1.9e-12 ***
## pct_under_18              -0.0222256671    0.0019157515  -11.60  < 2e-16 ***
## total_households          -0.0000026152    0.0000003060   -8.55  < 2e-16 ***
## pct_married_couple        -0.0138774853    0.0018166557   -7.64  2.6e-14 ***
## pct_male_householder      -0.0281977982    0.0022054730  -12.79  < 2e-16 ***
## pct_female_householder    -0.0152457556    0.0021476587   -7.10  1.4e-12 ***
## pct_unemployed            -0.0044214173    0.0036187298   -1.22  0.22183    
## pct_below_poverty         -0.0064797076    0.0015944951   -4.06  4.9e-05 ***
## pct_income_10_14999        0.0156180661    0.0032813040    4.76  2.0e-06 ***
## pct_income_15_24999        0.0178654188    0.0029630616    6.03  1.8e-09 ***
## pct_income_25_34999        0.0169949240    0.0029515076    5.76  9.0e-09 ***
## pct_income_35_49999        0.0202519328    0.0025159978    8.05  1.0e-15 ***
## pct_income_50_74999        0.0187283084    0.0024647786    7.60  3.5e-14 ***
## pct_income_75_99999        0.0203351809    0.0030702150    6.62  3.9e-11 ***
## pct_income_100_149k        0.0353523476    0.0029845575   11.85  < 2e-16 ***
## pct_income_150_199k        0.0409031617    0.0032778441   12.48  < 2e-16 ***
## pct_income_200k_plus       0.0521975589    0.0035885816   14.55  < 2e-16 ***
## median_household_income    0.0000004796    0.0000011133    0.43  0.66666    
## pct_high_school_plus       0.0002452845    0.0013286839    0.18  0.85354    
## pct_bachelors_plus        -0.0012255201    0.0007403949   -1.66  0.09794 .  
## total_housing_units        0.0000024487    0.0000002832    8.65  < 2e-16 ***
## pct_vacant_units           0.0084538688    0.0006613254   12.78  < 2e-16 ***
## pct_owner_occupied        -0.0133510620    0.0007505373  -17.79  < 2e-16 ***
## pct_built_2020_plus        0.0182040448    0.0018323538    9.93  < 2e-16 ***
## pct_built_2010_2019        0.0025345517    0.0010937243    2.32  0.02052 *  
## pct_built_2000_2009        0.0079883490    0.0011242495    7.11  1.4e-12 ***
## pct_built_1990_1999        0.0201886413    0.0010622824   19.00  < 2e-16 ***
## pct_built_1980_1989        0.0061704866    0.0009997202    6.17  7.2e-10 ***
## pct_built_1970_1979        0.0167673307    0.0009661879   17.35  < 2e-16 ***
## pct_built_1960_1969        0.0052036062    0.0013143285    3.96  7.6e-05 ***
## pct_built_1950_1959        0.0049729331    0.0014240322    3.49  0.00048 ***
## pct_built_1940_1949        0.0079898015    0.0021480089    3.72  0.00020 ***
## pct_1_bed                 -0.0110120715    0.0027523286   -4.00  6.4e-05 ***
## pct_2_bed                 -0.0158731141    0.0022809431   -6.96  3.9e-12 ***
## pct_3_bed                 -0.0182188726    0.0023205442   -7.85  5.0e-15 ***
## pct_4_bed                 -0.0278601810    0.0024195897  -11.51  < 2e-16 ***
## pct_5plus_bed             -0.0139748305    0.0025640677   -5.45  5.3e-08 ***
## area_typeMicro             0.0745508272    0.0072365730   10.30  < 2e-16 ***
## month_date_yyyymm          0.0013381116    0.0007270949    1.84  0.06577 .  
## total_listing_count       -0.0000204797    0.0000015986  -12.81  < 2e-16 ***
## pending_ratio             -0.0504287754    0.0093375029   -5.40  6.9e-08 ***
## median_days_on_market      0.0001558163    0.0001885562    0.83  0.40864    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.175 on 5181 degrees of freedom
## Multiple R-squared:  0.832,  Adjusted R-squared:  0.83 
## F-statistic:  502 on 51 and 5181 DF,  p-value: <2e-16

Have to eliminate cbsa_title or we would overfit

Slightly better fit (also around 2% better, coincidentally)

LASSO

Price

cv_lasso_price = cv.glmnet(X_train_matrix, price_train, alpha = 1)

lasso_price_best_lambda = cv_lasso_price$lambda.min
lasso_price_best_lambda
## [1] 0.0221
plot(cv_lasso_price)

lasso_price_best_model = glmnet(X_train_matrix, price_train, alpha = 1, lambda=lasso_price_best_lambda)
coef(lasso_price_best_model)
## 581 x 1 sparse Matrix of class "dgCMatrix"
##                                                                 s0
## (Intercept)                                              -2.83e+04
## total_population                                          6.15e-06
## pct_female                                                1.25e+00
## pct_white                                                -1.51e+00
## pct_black                                                -2.02e+00
## pct_aian                                                  .       
## pct_asian                                                 7.66e+00
## pct_nhpi                                                  1.05e+01
## pct_two_or_more                                           .       
## pct_hispanic                                              8.87e-02
## median_age                                                3.75e+00
## pct_under_18                                             -1.34e+00
## total_households                                          2.44e-07
## pct_married_couple                                        4.18e-01
## pct_male_householder                                     -2.97e+00
## pct_female_householder                                   -2.60e-02
## pct_unemployed                                            1.11e+00
## pct_below_poverty                                        -3.13e+00
## pct_income_10_14999                                      -1.68e+00
## pct_income_15_24999                                      -5.95e+00
## pct_income_25_34999                                      -2.96e+00
## pct_income_35_49999                                      -2.11e+00
## pct_income_50_74999                                      -3.41e+00
## pct_income_75_99999                                      -2.95e+00
## pct_income_100_149k                                       1.15e+00
## pct_income_150_199k                                       7.11e-02
## pct_income_200k_plus                                      1.28e+00
## median_household_income                                   .       
## pct_high_school_plus                                      .       
## pct_bachelors_plus                                        .       
## total_housing_units                                      -1.69e-05
## pct_vacant_units                                          2.63e+00
## pct_owner_occupied                                       -1.76e+00
## pct_built_2020_plus                                       1.58e+00
## pct_built_2010_2019                                       1.00e-02
## pct_built_2000_2009                                       7.01e-01
## pct_built_1990_1999                                       1.59e+00
## pct_built_1980_1989                                      -7.73e-03
## pct_built_1970_1979                                       2.33e+00
## pct_built_1960_1969                                       .       
## pct_built_1950_1959                                       .       
## pct_built_1940_1949                                      -1.66e+00
## pct_1_bed                                                 2.36e-01
## pct_2_bed                                                 1.91e+00
## pct_3_bed                                                -4.16e-02
## pct_4_bed                                                -3.92e+00
## pct_5plus_bed                                             1.16e+00
## area_typeMetro                                           -6.55e+00
## area_typeMicro                                            8.18e-14
## month_date_yyyymm                                         1.42e-01
## cbsa_titleAbilene, TX                                    -6.82e-01
## cbsa_titleAdrian, MI                                      3.75e+00
## cbsa_titleAkron, OH                                      -2.99e+01
## cbsa_titleAlamogordo, NM                                 -6.33e+00
## cbsa_titleAlbany-Schenectady-Troy, NY                    -9.66e+00
## cbsa_titleAlbany, GA                                     -3.53e+01
## cbsa_titleAlbany, OR                                      8.39e+01
## cbsa_titleAlbemarle, NC                                   3.32e+01
## cbsa_titleAlbertville, AL                                 1.34e+01
## cbsa_titleAlbuquerque, NM                                -2.00e+01
## cbsa_titleAlexandria, LA                                 -3.12e+00
## cbsa_titleAllentown-Bethlehem-Easton, PA-NJ              -1.11e+01
## cbsa_titleAltoona, PA                                    -5.98e+01
## cbsa_titleAmarillo, TX                                   -3.50e+01
## cbsa_titleAmes, IA                                        3.91e+01
## cbsa_titleAmherst Town-Northampton, MA                    3.62e+01
## cbsa_titleAnchorage, AK                                  -1.13e+02
## cbsa_titleAnderson Creek, NC                              3.53e+01
## cbsa_titleAnn Arbor, MI                                  -4.31e+01
## cbsa_titleAnniston-Oxford, AL                            -2.82e+01
## cbsa_titleAppleton, WI                                    1.71e+01
## cbsa_titleAsheville, NC                                   4.33e+01
## cbsa_titleAthens-Clarke County, GA                        5.27e+00
## cbsa_titleAthens, TN                                      .       
## cbsa_titleAthens, TX                                     -6.67e+00
## cbsa_titleAtlanta-Sandy Springs-Roswell, GA              -5.70e+01
## cbsa_titleAtlantic City-Hammonton, NJ                     6.75e+01
## cbsa_titleAuburn-Opelika, AL                              1.15e+00
## cbsa_titleAuburn, NY                                     -2.26e+01
## cbsa_titleAugusta-Richmond County, GA-SC                 -1.79e-01
## cbsa_titleAugusta-Waterville, ME                          3.04e+00
## cbsa_titleAustin-Round Rock-San Marcos, TX               -5.68e+01
## cbsa_titleBakersfield-Delano, CA                          5.88e-01
## cbsa_titleBaltimore-Columbia-Towson, MD                  -3.69e+01
## cbsa_titleBangor, ME                                     -1.52e+01
## cbsa_titleBaraboo, WI                                    -1.88e+01
## cbsa_titleBarnstable Town, MA                             1.98e+02
## cbsa_titleBaton Rouge, LA                                 2.25e+01
## cbsa_titleBattle Creek, MI                                9.40e+00
## cbsa_titleBay City, MI                                    9.22e+00
## cbsa_titleBeaumont-Port Arthur, TX                       -4.48e+01
## cbsa_titleBeaver Dam, WI                                 -6.11e+00
## cbsa_titleBeckley, WV                                     1.14e+01
## cbsa_titleBellingham, WA                                  7.94e+01
## cbsa_titleBend, OR                                        4.95e+01
## cbsa_titleBillings, MT                                    2.04e+01
## cbsa_titleBinghamton, NY                                 -1.80e+01
## cbsa_titleBirmingham, AL                                 -7.86e+00
## cbsa_titleBismarck, ND                                    1.47e+00
## cbsa_titleBlacksburg-Christiansburg-Radford, VA          -2.77e+01
## cbsa_titleBloomington, IL                                -6.70e+01
## cbsa_titleBloomington, IN                                -1.86e+01
## cbsa_titleBloomsburg-Berwick, PA                          2.08e+00
## cbsa_titleBluefield, WV-VA                               -3.46e+01
## cbsa_titleBoise City, ID                                  6.79e+01
## cbsa_titleBoston-Cambridge-Newton, MA-NH                  1.16e+02
## cbsa_titleBoulder, CO                                     1.32e+02
## cbsa_titleBowling Green, KY                              -1.97e+01
## cbsa_titleBozeman, MT                                     2.15e+02
## cbsa_titleBrainerd, MN                                    .       
## cbsa_titleBremerton-Silverdale-Port Orchard, WA          -2.86e+01
## cbsa_titleBridgeport-Stamford-Danbury, CT                 7.70e+01
## cbsa_titleBrigham City, UT-ID                             5.69e+01
## cbsa_titleBrownsville-Harlingen, TX                      -1.59e+01
## cbsa_titleBrunswick-St. Simons, GA                        3.51e+01
## cbsa_titleBuffalo-Cheektowaga, NY                         1.42e+00
## cbsa_titleBurlington-South Burlington, VT                 4.42e+01
## cbsa_titleBurlington, NC                                 -1.66e+01
## cbsa_titleCanton-Massillon, OH                           -1.88e+01
## cbsa_titleCape Coral-Fort Myers, FL                      -9.79e+01
## cbsa_titleCape Girardeau, MO-IL                          -4.04e+01
## cbsa_titleCarson City, NV                                -2.53e+01
## cbsa_titleCasper, WY                                      4.49e+01
## cbsa_titleCedar City, UT                                  4.31e+01
## cbsa_titleCedar Rapids, IA                               -7.62e+00
## cbsa_titleCentralia, WA                                   2.95e+01
## cbsa_titleChambersburg, PA                                .       
## cbsa_titleChampaign-Urbana, IL                           -8.71e+01
## cbsa_titleCharleston-North Charleston, SC                 5.82e+01
## cbsa_titleCharleston, WV                                 -5.64e+01
## cbsa_titleCharlotte-Concord-Gastonia, NC-SC              -2.29e+01
## cbsa_titleCharlottesville, VA                            -1.66e+00
## cbsa_titleChattanooga, TN-GA                              2.12e+01
## cbsa_titleCheyenne, WY                                    5.20e+00
## cbsa_titleChicago-Naperville-Elgin, IL-IN                -7.59e+01
## cbsa_titleChico, CA                                       4.49e+00
## cbsa_titleChillicothe, OH                                 .       
## cbsa_titleCincinnati, OH-KY-IN                            4.40e+00
## cbsa_titleClarksburg, WV                                 -1.58e+01
## cbsa_titleClarksville, TN-KY                              9.85e+00
## cbsa_titleClearlake, CA                                   2.91e+01
## cbsa_titleCleveland, OH                                  -1.17e+01
## cbsa_titleCleveland, TN                                  -2.99e+01
## cbsa_titleClovis, NM                                     -7.66e+00
## cbsa_titleCoeur d'Alene, ID                               1.09e+02
## cbsa_titleCollege Station-Bryan, TX                       6.63e+00
## cbsa_titleColorado Springs, CO                            1.21e+00
## cbsa_titleColumbia, MO                                   -1.75e+01
## cbsa_titleColumbia, SC                                    4.74e+00
## cbsa_titleColumbus, GA-AL                                -8.43e+00
## cbsa_titleColumbus, IN                                   -7.08e+01
## cbsa_titleColumbus, MS                                   -6.69e+00
## cbsa_titleColumbus, OH                                   -2.45e+01
## cbsa_titleConcord, NH                                     5.41e-01
## cbsa_titleCookeville, TN                                  3.20e+01
## cbsa_titleCorbin, KY                                      2.19e+01
## cbsa_titleCorning, NY                                    -6.14e+01
## cbsa_titleCorpus Christi, TX                             -2.27e+01
## cbsa_titleCorvallis, OR                                   3.77e+01
## cbsa_titleCrestview-Fort Walton Beach-Destin, FL          1.94e+02
## cbsa_titleCrossville, TN                                 -2.73e+01
## cbsa_titleCullman, AL                                     8.63e-01
## cbsa_titleCumberland, MD-WV                              -2.08e+01
## cbsa_titleDallas-Fort Worth-Arlington, TX                -8.93e+01
## cbsa_titleDalton, GA                                      2.43e+01
## cbsa_titleDanville, IL                                   -2.71e+01
## cbsa_titleDanville, VA                                   -3.97e+01
## cbsa_titleDaphne-Fairhope-Foley, AL                       1.06e+02
## cbsa_titleDavenport-Moline-Rock Island, IA-IL            -3.43e+01
## cbsa_titleDayton-Kettering-Beavercreek, OH               -1.50e+01
## cbsa_titleDecatur, AL                                    -2.97e+01
## cbsa_titleDecatur, IL                                    -4.95e+01
## cbsa_titleDeltona-Daytona Beach-Ormond Beach, FL         -1.97e+01
## cbsa_titleDenver-Aurora-Centennial, CO                    9.70e-01
## cbsa_titleDes Moines-West Des Moines, IA                  1.60e+01
## cbsa_titleDetroit-Warren-Dearborn, MI                    -3.87e+00
## cbsa_titleDothan, AL                                     -6.07e+00
## cbsa_titleDover, DE                                       1.34e+01
## cbsa_titleDuBois, PA                                     -1.40e+00
## cbsa_titleDubuque, IA                                     1.86e+01
## cbsa_titleDuluth, MN-WI                                   1.61e+01
## cbsa_titleDurham-Chapel Hill, NC                         -1.65e+01
## cbsa_titleEagle Pass, TX                                  .       
## cbsa_titleEast Stroudsburg, PA                           -2.77e+01
## cbsa_titleEau Claire, WI                                 -1.19e+01
## cbsa_titleEl Centro, CA                                  -3.05e-01
## cbsa_titleEl Paso, TX                                     9.07e-01
## cbsa_titleElizabethtown, KY                              -7.50e+00
## cbsa_titleElkhart-Goshen, IN                             -7.59e+00
## cbsa_titleElmira, NY                                     -1.62e+01
## cbsa_titleEnid, OK                                       -7.87e+01
## cbsa_titleErie, PA                                        1.65e+01
## cbsa_titleEugene-Springfield, OR                          6.45e+01
## cbsa_titleEureka-Arcata, CA                               8.27e+01
## cbsa_titleEvansville, IN                                 -3.13e+01
## cbsa_titleFairbanks-College, AK                          -1.03e+02
## cbsa_titleFargo, ND-MN                                   -1.07e+01
## cbsa_titleFaribault-Northfield, MN                        1.92e+01
## cbsa_titleFarmington, MO                                 -4.82e+00
## cbsa_titleFarmington, NM                                 -2.33e+01
## cbsa_titleFayetteville-Springdale-Rogers, AR             -5.89e+01
## cbsa_titleFayetteville, NC                                2.74e+01
## cbsa_titleFindlay, OH                                    -6.94e+01
## cbsa_titleFlagstaff, AZ                                   1.02e+02
## cbsa_titleFlint, MI                                       3.88e+00
## cbsa_titleFlorence-Muscle Shoals, AL                     -1.99e+01
## cbsa_titleFlorence, SC                                    6.87e+00
## cbsa_titleFond du Lac, WI                                -2.95e+00
## cbsa_titleForest City, NC                                 1.96e+01
## cbsa_titleFort Collins-Loveland, CO                       9.85e+00
## cbsa_titleFort Payne, AL                                  5.74e+01
## cbsa_titleFort Smith, AR-OK                              -4.01e+01
## cbsa_titleFort Wayne, IN                                  3.61e+00
## cbsa_titleFrankfort, KY                                  -8.43e+00
## cbsa_titleFresno, CA                                     -4.12e+01
## cbsa_titleGadsden, AL                                     1.23e+01
## cbsa_titleGainesville, FL                                 2.11e+01
## cbsa_titleGainesville, GA                                -2.27e+01
## cbsa_titleGallup, NM                                     -6.97e+01
## cbsa_titleGettysburg, PA                                 -1.01e+00
## cbsa_titleGlens Falls, NY                                 2.25e+01
## cbsa_titleGoldsboro, NC                                   3.75e+01
## cbsa_titleGranbury, TX                                    .       
## cbsa_titleGrand Forks, ND-MN                             -3.96e+01
## cbsa_titleGrand Island, NE                               -7.03e+01
## cbsa_titleGrand Junction, CO                              8.06e+01
## cbsa_titleGrand Rapids-Wyoming-Kentwood, MI               2.14e+01
## cbsa_titleGrants Pass, OR                                 4.88e+01
## cbsa_titleGreat Falls, MT                                 5.99e+01
## cbsa_titleGreeley, CO                                    -3.93e+00
## cbsa_titleGreen Bay, WI                                   6.57e+00
## cbsa_titleGreeneville, TN                                 4.43e+01
## cbsa_titleGreenfield, MA                                  .       
## cbsa_titleGreensboro-High Point, NC                      -1.56e+01
## cbsa_titleGreenville-Anderson-Greer, SC                  -3.28e+00
## cbsa_titleGreenville, NC                                 -1.95e+01
## cbsa_titleGreenwood, SC                                   5.43e+01
## cbsa_titleGulfport-Biloxi, MS                             .       
## cbsa_titleHagerstown-Martinsburg, MD-WV                   1.70e+01
## cbsa_titleHammond, LA                                     6.22e+01
## cbsa_titleHanford-Corcoran, CA                           -9.29e+00
## cbsa_titleHarrisburg-Carlisle, PA                        -5.14e+01
## cbsa_titleHarrisonburg, VA                                3.24e+01
## cbsa_titleHartford-West Hartford-East Hartford, CT       -3.26e+01
## cbsa_titleHattiesburg, MS                                 3.10e+01
## cbsa_titleHeber, UT                                       2.26e+02
## cbsa_titleHelena, MT                                      2.93e+01
## cbsa_titleHermiston-Pendleton, OR                         7.99e+00
## cbsa_titleHermitage, PA                                   1.96e+01
## cbsa_titleHickory-Lenoir-Morganton, NC                   -6.87e+00
## cbsa_titleHilo-Kailua, HI                                -1.31e+02
## cbsa_titleHilton Head Island-Bluffton-Port Royal, SC      5.71e+01
## cbsa_titleHinesville, GA                                  1.55e+01
## cbsa_titleHobbs, NM                                      -6.19e+01
## cbsa_titleHolland, MI                                     2.78e+01
## cbsa_titleHomosassa Springs, FL                          -1.08e+02
## cbsa_titleHot Springs, AR                                -1.93e+01
## cbsa_titleHouma-Bayou Cane-Thibodaux, LA                 -9.99e-01
## cbsa_titleHouston-Pasadena-The Woodlands, TX             -9.21e+01
## cbsa_titleHuntington-Ashland, WV-KY-OH                   -3.32e+01
## cbsa_titleHuntsville, AL                                  1.99e+00
## cbsa_titleHuntsville, TX                                  .       
## cbsa_titleIdaho Falls, ID                                -2.44e+01
## cbsa_titleIndiana, PA                                    -4.80e+01
## cbsa_titleIndianapolis-Carmel-Greenwood, IN              -4.30e+01
## cbsa_titleIowa City, IA                                  -3.14e+01
## cbsa_titleIthaca, NY                                     -5.12e+01
## cbsa_titleJackson, MI                                    -8.43e+00
## cbsa_titleJackson, MS                                     2.42e+01
## cbsa_titleJackson, TN                                     6.74e+01
## cbsa_titleJacksonville, FL                               -1.71e+00
## cbsa_titleJacksonville, NC                                2.95e+01
## cbsa_titleJamestown-Dunkirk, NY                          -1.73e+00
## cbsa_titleJanesville-Beloit, WI                           1.50e+01
## cbsa_titleJefferson City, MO                             -2.83e+00
## cbsa_titleJefferson, GA                                  -2.33e+01
## cbsa_titleJohnson City, TN                                3.64e+01
## cbsa_titleJohnstown, PA                                  -1.54e+01
## cbsa_titleJonesboro, AR                                  -2.41e+00
## cbsa_titleJoplin, MO-KS                                  -9.68e+00
## cbsa_titleKahului-Wailuku, HI                             4.13e+02
## cbsa_titleKalamazoo-Portage, MI                          -1.01e+01
## cbsa_titleKalispell, MT                                   1.95e+02
## cbsa_titleKankakee, IL                                   -4.06e+01
## cbsa_titleKansas City, MO-KS                             -3.31e+00
## cbsa_titleKapaa, HI                                       2.88e+02
## cbsa_titleKeene, NH                                      -2.25e+01
## cbsa_titleKennewick-Richland, WA                          4.62e+01
## cbsa_titleKenosha, WI                                    -2.10e+01
## cbsa_titleKey West-Key Largo, FL                          4.56e+02
## cbsa_titleKilleen-Temple, TX                              2.42e+01
## cbsa_titleKingsport-Bristol, TN-VA                       -9.53e+00
## cbsa_titleKingston, NY                                    4.53e+01
## cbsa_titleKiryas Joel-Poughkeepsie-Newburgh, NY           1.68e+01
## cbsa_titleKlamath Falls, OR                              -5.19e+00
## cbsa_titleKnoxville, TN                                   4.22e+01
## cbsa_titleKokomo, IN                                     -3.01e+01
## cbsa_titleLa Crosse-Onalaska, WI-MN                      -1.16e+01
## cbsa_titleLaconia, NH                                    -2.40e+01
## cbsa_titleLafayette-West Lafayette, IN                    1.42e+01
## cbsa_titleLafayette, LA                                   1.21e+01
## cbsa_titleLaGrange, GA-AL                                 1.92e+01
## cbsa_titleLake Charles, LA                               -1.26e+01
## cbsa_titleLake City, FL                                   3.43e+01
## cbsa_titleLake Havasu City-Kingman, AZ                   -1.40e+00
## cbsa_titleLakeland-Winter Haven, FL                      -1.25e+01
## cbsa_titleLancaster, PA                                   1.41e+00
## cbsa_titleLansing-East Lansing, MI                       -5.05e+01
## cbsa_titleLaredo, TX                                     -2.40e+01
## cbsa_titleLas Cruces, NM                                 -9.52e+00
## cbsa_titleLas Vegas-Henderson-North Las Vegas, NV        -8.45e+01
## cbsa_titleLaurel, MS                                      3.41e+01
## cbsa_titleLawrence, KS                                    .       
## cbsa_titleLawton, OK                                     -5.87e+01
## cbsa_titleLebanon-Claremont, NH-VT                        1.38e+00
## cbsa_titleLebanon, PA                                     2.86e+01
## cbsa_titleLewiston-Auburn, ME                             5.00e+01
## cbsa_titleLewiston, ID-WA                                 3.93e+01
## cbsa_titleLexington Park, MD                             -2.12e+01
## cbsa_titleLexington-Fayette, KY                          -3.97e+00
## cbsa_titleLima, OH                                       -1.24e+01
## cbsa_titleLincoln, NE                                    -1.71e+01
## cbsa_titleLittle Rock-North Little Rock-Conway, AR       -9.29e+00
## cbsa_titleLogan, UT-ID                                    4.63e+01
## cbsa_titleLongview-Kelso, WA                              6.50e+01
## cbsa_titleLongview, TX                                   -6.15e+00
## cbsa_titleLos Angeles-Long Beach-Anaheim, CA              2.14e+02
## cbsa_titleLouisville/Jefferson County, KY-IN             -1.37e+01
## cbsa_titleLubbock, TX                                    -4.75e+01
## cbsa_titleLufkin, TX                                     -1.68e+01
## cbsa_titleLumberton, NC                                  -1.34e+01
## cbsa_titleLynchburg, VA                                   2.00e+01
## cbsa_titleMacon-Bibb County, GA                          -2.73e+01
## cbsa_titleMadison, WI                                    -3.58e+01
## cbsa_titleManchester-Nashua, NH                          -1.27e+01
## cbsa_titleManhattan, KS                                  -6.19e+01
## cbsa_titleManitowoc, WI                                  -8.96e+00
## cbsa_titleMankato, MN                                    -1.48e+01
## cbsa_titleMansfield, OH                                   4.62e+00
## cbsa_titleMarinette, WI-MI                               -6.26e+01
## cbsa_titleMarion-Herrin, IL                              -8.83e+01
## cbsa_titleMarion, IN                                      5.05e+01
## cbsa_titleMarquette, MI                                  -1.40e+00
## cbsa_titleMassena-Ogdensburg, NY                         -2.64e+01
## cbsa_titleMcAllen-Edinburg-Mission, TX                   -3.43e+01
## cbsa_titleMeadville, PA                                  -2.67e+01
## cbsa_titleMedford, OR                                     4.58e+01
## cbsa_titleMemphis, TN-MS-AR                               2.06e+01
## cbsa_titleMerced, CA                                      2.02e+01
## cbsa_titleMeridian, MS                                   -4.68e+01
## cbsa_titleMiami-Fort Lauderdale-West Palm Beach, FL       4.47e+01
## cbsa_titleMichigan City-La Porte, IN                      .       
## cbsa_titleMidland, MI                                     1.78e+01
## cbsa_titleMidland, TX                                    -1.72e+01
## cbsa_titleMilwaukee-Waukesha, WI                         -1.42e+01
## cbsa_titleMinneapolis-St. Paul-Bloomington, MN-WI        -4.98e+01
## cbsa_titleMinot, ND                                      -3.70e+01
## cbsa_titleMissoula, MT                                    1.21e+02
## cbsa_titleMobile, AL                                      2.63e+01
## cbsa_titleModesto, CA                                     5.09e+00
## cbsa_titleMonroe, LA                                      9.42e+00
## cbsa_titleMonroe, MI                                      5.86e+01
## cbsa_titleMontgomery, AL                                  1.35e+01
## cbsa_titleMonticello, NY                                  1.02e+01
## cbsa_titleMorehead City, NC                               2.26e+01
## cbsa_titleMorgantown, WV                                 -1.59e+01
## cbsa_titleMorristown, TN                                  5.37e+01
## cbsa_titleMoses Lake, WA                                  3.21e+01
## cbsa_titleMount Airy, NC                                  1.44e+00
## cbsa_titleMount Pleasant, MI                             -1.77e+01
## cbsa_titleMount Vernon-Anacortes, WA                      5.20e+01
## cbsa_titleMuncie, IN                                     -2.30e+01
## cbsa_titleMurrells Inlet, SC                              4.54e+01
## cbsa_titleMuskegon-Norton Shores, MI                      7.37e+01
## cbsa_titleMuskogee, OK                                   -5.73e+01
## cbsa_titleMyrtle Beach-Conway-North Myrtle Beach, SC     -1.74e+01
## cbsa_titleNacogdoches, TX                                -1.05e+00
## cbsa_titleNapa, CA                                        3.02e+02
## cbsa_titleNaples-Marco Island, FL                        -1.76e-01
## cbsa_titleNashville-Davidson--Murfreesboro--Franklin, TN  2.64e+01
## cbsa_titleNew Bern, NC                                    1.52e+01
## cbsa_titleNew Haven, CT                                  -1.01e+01
## cbsa_titleNew Iberia, LA                                  6.66e+00
## cbsa_titleNew Orleans-Metairie, LA                        1.19e+01
## cbsa_titleNew Philadelphia-Dover, OH                     -2.31e+01
## cbsa_titleNew York-Newark-Jersey City, NY-NJ              1.75e+02
## cbsa_titleNiles, MI                                       3.58e+00
## cbsa_titleNorth Port-Bradenton-Sarasota, FL              -8.85e+01
## cbsa_titleNorth Wilkesboro, NC                            3.59e+01
## cbsa_titleNorwich-New London-Willimantic, CT              3.64e+00
## cbsa_titleOak Harbor, WA                                  2.17e+01
## cbsa_titleOcala, FL                                      -6.74e+01
## cbsa_titleOdessa, TX                                     -1.72e+01
## cbsa_titleOgden, UT                                       2.29e+00
## cbsa_titleOklahoma City, OK                              -1.67e+01
## cbsa_titleOlean, NY                                      -3.59e+00
## cbsa_titleOlympia-Lacey-Tumwater, WA                     -3.68e+01
## cbsa_titleOmaha, NE-IA                                   -2.27e+01
## cbsa_titleOpelousas, LA                                   4.97e+00
## cbsa_titleOrangeburg, SC                                 -2.79e+01
## cbsa_titleOrlando-Kissimmee-Sanford, FL                  -2.31e+01
## cbsa_titleOshkosh-Neenah, WI                             -7.08e+00
## cbsa_titleOttawa, IL                                     -3.43e+01
## cbsa_titleOwensboro, KY                                   1.25e+01
## cbsa_titleOwosso, MI                                     -1.31e+01
## cbsa_titleOxford, MS                                     -1.83e-01
## cbsa_titleOxnard-Thousand Oaks-Ventura, CA                1.80e+02
## cbsa_titlePaducah, KY-IL                                 -8.32e+01
## cbsa_titlePalatka, FL                                     5.21e+01
## cbsa_titlePalm Bay-Melbourne-Titusville, FL               2.66e-01
## cbsa_titlePanama City-Panama City Beach, FL               4.79e+01
## cbsa_titleParkersburg-Vienna, WV                         -3.04e+01
## cbsa_titlePensacola-Ferry Pass-Brent, FL                  2.92e+01
## cbsa_titlePeoria, IL                                     -6.53e+01
## cbsa_titlePhiladelphia-Camden-Wilmington, PA-NJ-DE-MD     .       
## cbsa_titlePhoenix-Mesa-Chandler, AZ                      -4.38e+00
## cbsa_titlePikeville, KY                                  -2.28e+01
## cbsa_titlePine Bluff, AR                                 -3.44e+01
## cbsa_titlePinehurst-Southern Pines, NC                    2.02e+01
## cbsa_titlePittsburgh, PA                                 -1.78e+01
## cbsa_titlePittsfield, MA                                  5.91e+01
## cbsa_titlePlattsburgh, NY                                -3.59e+01
## cbsa_titlePocatello, ID                                   2.02e+01
## cbsa_titlePort Angeles, WA                                3.91e+00
## cbsa_titlePort St. Lucie, FL                             -3.80e+01
## cbsa_titlePortland-South Portland, ME                     9.67e+01
## cbsa_titlePortland-Vancouver-Hillsboro, OR-WA            -1.85e+01
## cbsa_titlePortsmouth, OH                                  .       
## cbsa_titlePottsville, PA                                 -4.22e+01
## cbsa_titlePrescott Valley-Prescott, AZ                    1.79e+01
## cbsa_titleProvidence-Warwick, RI-MA                       6.20e+01
## cbsa_titleProvo-Orem-Lehi, UT                            -3.49e+00
## cbsa_titlePueblo, CO                                      4.36e+00
## cbsa_titlePunta Gorda, FL                                -1.56e+02
## cbsa_titlePutnam, CT                                      1.75e+01
## cbsa_titleQuincy, IL-MO                                  -4.66e+01
## cbsa_titleRacine-Mount Pleasant, WI                       2.42e+01
## cbsa_titleRaleigh-Cary, NC                               -6.82e+01
## cbsa_titleRapid City, SD                                  3.80e+01
## cbsa_titleReading, PA                                    -8.44e-01
## cbsa_titleRedding, CA                                    -3.47e-01
## cbsa_titleReno, NV                                        3.18e+00
## cbsa_titleRexburg, ID                                     8.90e+00
## cbsa_titleRichmond-Berea, KY                              1.43e+01
## cbsa_titleRichmond, IN                                    3.69e+01
## cbsa_titleRichmond, VA                                    1.14e+01
## cbsa_titleRifle, CO                                       6.18e+02
## cbsa_titleRio Grande City-Roma, TX                        .       
## cbsa_titleRiverside-San Bernardino-Ontario, CA            1.11e+00
## cbsa_titleRoanoke Rapids, NC                             -3.72e+01
## cbsa_titleRoanoke, VA                                    -1.85e+01
## cbsa_titleRochester, MN                                  -2.37e+01
## cbsa_titleRochester, NY                                  -2.31e+01
## cbsa_titleRockford, IL                                   -5.95e+01
## cbsa_titleRocky Mount, NC                                 2.72e+01
## cbsa_titleRome, GA                                        8.93e+00
## cbsa_titleRoseburg, OR                                    6.08e+01
## cbsa_titleRussellville, AR                               -4.11e+01
## cbsa_titleSacramento-Roseville-Folsom, CA                -6.38e+01
## cbsa_titleSaginaw, MI                                     .       
## cbsa_titleSalem, OH                                       2.55e+00
## cbsa_titleSalem, OR                                       3.54e-01
## cbsa_titleSalinas, CA                                     3.74e+02
## cbsa_titleSalisbury, MD                                   1.59e+01
## cbsa_titleSalt Lake City-Murray, UT                      -2.22e+01
## cbsa_titleSan Angelo, TX                                 -5.51e+01
## cbsa_titleSan Antonio-New Braunfels, TX                  -4.80e+01
## cbsa_titleSan Diego-Chula Vista-Carlsbad, CA              1.78e+02
## cbsa_titleSan Francisco-Oakland-Fremont, CA               8.72e+01
## cbsa_titleSan Jose-Sunnyvale-Santa Clara, CA              1.11e+02
## cbsa_titleSan Luis Obispo-Paso Robles, CA                 2.33e+02
## cbsa_titleSandusky, OH                                   -1.34e+00
## cbsa_titleSanford, NC                                     1.39e+01
## cbsa_titleSanta Cruz-Watsonville, CA                      3.84e+02
## cbsa_titleSanta Fe, NM                                    4.78e+01
## cbsa_titleSanta Maria-Santa Barbara, CA                   6.04e+02
## cbsa_titleSanta Rosa-Petaluma, CA                         1.74e+02
## cbsa_titleSavannah, GA                                    1.73e+01
## cbsa_titleScranton--Wilkes-Barre, PA                     -2.61e+01
## cbsa_titleSeaford, DE                                    -5.33e+01
## cbsa_titleSearcy, AR                                      .       
## cbsa_titleSeattle-Tacoma-Bellevue, WA                     3.28e+00
## cbsa_titleSebastian-Vero Beach-West Vero Corridor, FL    -5.05e+01
## cbsa_titleSebring, FL                                    -9.51e+01
## cbsa_titleSeneca, SC                                      8.66e+00
## cbsa_titleSevierville, TN                                 1.08e+02
## cbsa_titleShawnee, OK                                    -1.74e+01
## cbsa_titleSheboygan, WI                                  -3.78e+01
## cbsa_titleShelby-Kings Mountain, NC                       .       
## cbsa_titleShelton, WA                                    -2.41e+01
## cbsa_titleSherman-Denison, TX                             1.84e+01
## cbsa_titleShow Low, AZ                                    6.67e+01
## cbsa_titleShreveport-Bossier City, LA                     1.62e+01
## cbsa_titleSierra Vista-Douglas, AZ                       -2.04e+01
## cbsa_titleSioux City, IA-NE-SD                           -6.07e+01
## cbsa_titleSioux Falls, SD-MN                              2.12e+01
## cbsa_titleSlidell-Mandeville-Covington, LA                2.31e+01
## cbsa_titleSomerset, KY                                    2.23e+01
## cbsa_titleSomerset, PA                                   -2.91e+01
## cbsa_titleSouth Bend-Mishawaka, IN-MI                     3.09e+00
## cbsa_titleSpartanburg, SC                                 .       
## cbsa_titleSpokane-Spokane Valley, WA                      9.46e+00
## cbsa_titleSpringfield, IL                                -1.06e+02
## cbsa_titleSpringfield, MA                                 1.14e+01
## cbsa_titleSpringfield, MO                                -5.52e+00
## cbsa_titleSpringfield, OH                                -9.84e+00
## cbsa_titleSt. Cloud, MN                                   .       
## cbsa_titleSt. George, UT                                  6.09e+01
## cbsa_titleSt. Joseph, MO-KS                              -6.43e+01
## cbsa_titleSt. Louis, MO-IL                               -3.54e+01
## cbsa_titleState College, PA                              -3.32e+01
## cbsa_titleStatesboro, GA                                  8.61e+01
## cbsa_titleStaunton-Stuarts Draft, VA                      2.27e+01
## cbsa_titleStevens Point-Plover, WI                       -6.57e+00
## cbsa_titleStillwater, OK                                 -2.83e+01
## cbsa_titleStockton-Lodi, CA                              -1.03e+02
## cbsa_titleSumter, SC                                      4.03e+01
## cbsa_titleSunbury, PA                                    -1.81e+01
## cbsa_titleSyracuse, NY                                   -9.40e+00
## cbsa_titleTalladega-Sylacauga, AL                         7.19e+00
## cbsa_titleTallahassee, FL                                 2.34e+01
## cbsa_titleTampa-St. Petersburg-Clearwater, FL            -3.87e+01
## cbsa_titleTerre Haute, IN                                -1.80e+01
## cbsa_titleTexarkana, TX-AR                               -3.26e+00
## cbsa_titleThomasville, GA                                 4.39e+01
## cbsa_titleToledo, OH                                      .       
## cbsa_titleTopeka, KS                                     -4.34e+01
## cbsa_titleTorrington, CT                                  4.63e+01
## cbsa_titleTraverse City, MI                               5.03e+01
## cbsa_titleTrenton-Princeton, NJ                          -9.64e+01
## cbsa_titleTruckee-Grass Valley, CA                       -1.20e+01
## cbsa_titleTucson, AZ                                     -1.45e+01
## cbsa_titleTullahoma-Manchester, TN                        5.42e+01
## cbsa_titleTulsa, OK                                      -3.46e+01
## cbsa_titleTupelo, MS                                      3.87e+01
## cbsa_titleTuscaloosa, AL                                  2.86e+01
## cbsa_titleTwin Falls, ID                                  7.13e+01
## cbsa_titleTyler, TX                                      -3.28e+01
## cbsa_titleUkiah, CA                                       5.60e+01
## cbsa_titleUrban Honolulu, HI                             -1.46e+02
## cbsa_titleUtica-Rome, NY                                 -6.92e+00
## cbsa_titleValdosta, GA                                    4.24e+01
## cbsa_titleVallejo, CA                                    -5.93e+01
## cbsa_titleVictoria, TX                                   -1.90e+01
## cbsa_titleVineland, NJ                                    2.06e+00
## cbsa_titleVirginia Beach-Chesapeake-Norfolk, VA-NC        1.30e+01
## cbsa_titleVisalia, CA                                     .       
## cbsa_titleWaco, TX                                        2.10e+01
## cbsa_titleWalla Walla, WA                                 1.00e+01
## cbsa_titleWarner Robins, GA                              -2.59e+01
## cbsa_titleWarsaw, IN                                     -9.38e+00
## cbsa_titleWashington-Arlington-Alexandria, DC-VA-MD-WV   -4.21e+01
## cbsa_titleWaterbury-Shelton, CT                           2.41e+01
## cbsa_titleWaterloo-Cedar Falls, IA                        .       
## cbsa_titleWatertown-Fort Atkinson, WI                    -3.80e+01
## cbsa_titleWatertown-Fort Drum, NY                        -4.49e+01
## cbsa_titleWausau, WI                                     -7.01e+01
## cbsa_titleWeirton-Steubenville, WV-OH                    -1.91e+01
## cbsa_titleWenatchee-East Wenatchee, WA                    7.03e+01
## cbsa_titleWheeling, WV-OH                                -1.08e+01
## cbsa_titleWhitewater-Elkhorn, WI                         -8.18e+00
## cbsa_titleWichita Falls, TX                              -5.90e+01
## cbsa_titleWichita, KS                                    -3.74e+01
## cbsa_titleWildwood-The Villages, FL                      -1.30e+02
## cbsa_titleWilliamsport, PA                                1.15e+00
## cbsa_titleWilmington, NC                                  9.79e+00
## cbsa_titleWilson, NC                                     -8.82e+00
## cbsa_titleWinchester, VA-WV                               2.24e+01
## cbsa_titleWinston-Salem, NC                              -9.55e+00
## cbsa_titleWisconsin Rapids-Marshfield, WI                -4.40e+01
## cbsa_titleWooster, OH                                     1.61e+01
## cbsa_titleWorcester, MA                                   9.36e+00
## cbsa_titleYakima, WA                                      3.14e+01
## cbsa_titleYork-Hanover, PA                                8.13e+00
## cbsa_titleYoungstown-Warren, OH                           1.21e+01
## cbsa_titleYuba City, CA                                  -8.36e+01
## cbsa_titleYuma, AZ                                        1.55e+00
## cbsa_titleZanesville, OH                                 -7.27e+00
## total_listing_count                                      -2.85e-05
## pending_ratio                                             3.82e-01
## median_days_on_market                                    -2.27e-01
Price No CBSA
cv_lasso_price_no_cbsa = cv.glmnet(X_train_no_cbsa_matrix, price_train, alpha = 1)

lasso_price_no_cbsa_best_lambda = cv_lasso_price_no_cbsa$lambda.min
lasso_price_no_cbsa_best_lambda
## [1] 0.00871
plot(cv_lasso_price_no_cbsa)

lasso_price_no_cbsa_best_model = glmnet(X_train_no_cbsa_matrix, price_train, alpha = 1, lambda=lasso_price_no_cbsa_best_lambda)
coef(lasso_price_no_cbsa_best_model)
## 53 x 1 sparse Matrix of class "dgCMatrix"
##                                       s0
## (Intercept)             8558.64296086404
## total_population           0.00005748036
## pct_female                 2.64249225410
## pct_white                 -3.45508439291
## pct_black                 -3.97377644423
## pct_aian                  -3.56351837921
## pct_asian                 -2.76107999747
## pct_nhpi                  22.12693667884
## pct_two_or_more           -4.04460426753
## pct_hispanic               0.43562573520
## median_age                 2.39759589186
## pct_under_18              -4.37355011855
## total_households          -0.00043711653
## pct_married_couple        -4.33307565435
## pct_male_householder      -8.59148693367
## pct_female_householder    -3.63143394876
## pct_unemployed            -7.62822856157
## pct_below_poverty         -0.97023858697
## pct_income_10_14999        1.03952918534
## pct_income_15_24999        6.29423663715
## pct_income_25_34999        3.20800639128
## pct_income_35_49999        4.32468728962
## pct_income_50_74999        3.45166318929
## pct_income_75_99999        0.00137717272
## pct_income_100_149k        4.64515410501
## pct_income_150_199k        1.82586284515
## pct_income_200k_plus       8.03131887836
## median_household_income    0.00417472933
## pct_high_school_plus      -0.15404493456
## pct_bachelors_plus        -0.67637316136
## total_housing_units        0.00028596733
## pct_vacant_units           4.36141351837
## pct_owner_occupied        -3.98585115960
## pct_built_2020_plus        1.49258264384
## pct_built_2010_2019        0.58493218356
## pct_built_2000_2009        1.74238104171
## pct_built_1990_1999        5.34408170918
## pct_built_1980_1989        0.04760619616
## pct_built_1970_1979        4.42778994983
## pct_built_1960_1969        3.28417415527
## pct_built_1950_1959        1.44522674786
## pct_built_1940_1949        1.06215683361
## pct_1_bed                 -3.02376699786
## pct_2_bed                 -5.81814692770
## pct_3_bed                 -5.62966127322
## pct_4_bed                -10.03801360160
## pct_5plus_bed             -4.75030766219
## area_typeMetro           -19.11903103655
## area_typeMicro             0.00000000117
## month_date_yyyymm         -0.03785014164
## total_listing_count       -0.00427729735
## pending_ratio            -13.12349573252
## median_days_on_market      0.32987981969

Log Price

cv_lasso_log_price = cv.glmnet(X_train_matrix, log_price_train, alpha = 1)

lasso_log_price_best_lambda = cv_lasso_log_price$lambda.min
lasso_log_price_best_lambda
## [1] 0.0000929
plot(cv_lasso_log_price)

lasso_log_price_best_model = glmnet(X_train_matrix, log_price_train, alpha = 1, lambda=lasso_log_price_best_lambda)
coef(lasso_log_price_best_model)
## 581 x 1 sparse Matrix of class "dgCMatrix"
##                                                                 s0
## (Intercept)                                              -4.25e+02
## total_population                                          .       
## pct_female                                                1.82e-03
## pct_white                                                -4.52e-03
## pct_black                                                -7.74e-03
## pct_aian                                                  2.55e-04
## pct_asian                                                 7.46e-03
## pct_nhpi                                                  1.01e-02
## pct_two_or_more                                           .       
## pct_hispanic                                              1.14e-03
## median_age                                                1.40e-02
## pct_under_18                                             -4.18e-04
## total_households                                          2.79e-09
## pct_married_couple                                        .       
## pct_male_householder                                     -1.63e-02
## pct_female_householder                                   -5.09e-03
## pct_unemployed                                            2.79e-02
## pct_below_poverty                                        -1.48e-02
## pct_income_10_14999                                      -4.08e-02
## pct_income_15_24999                                      -1.18e-02
## pct_income_25_34999                                      -1.39e-02
## pct_income_35_49999                                      -1.61e-02
## pct_income_50_74999                                      -2.29e-02
## pct_income_75_99999                                      -2.08e-02
## pct_income_100_149k                                      -3.23e-04
## pct_income_150_199k                                       1.02e-03
## pct_income_200k_plus                                      1.11e-02
## median_household_income                                   .       
## pct_high_school_plus                                      .       
## pct_bachelors_plus                                        .       
## total_housing_units                                       .       
## pct_vacant_units                                          1.25e-02
## pct_owner_occupied                                       -9.68e-03
## pct_built_2020_plus                                       1.26e-02
## pct_built_2010_2019                                       .       
## pct_built_2000_2009                                       2.40e-04
## pct_built_1990_1999                                       1.02e-02
## pct_built_1980_1989                                       2.56e-03
## pct_built_1970_1979                                       1.21e-02
## pct_built_1960_1969                                      -2.67e-03
## pct_built_1950_1959                                      -5.08e-03
## pct_built_1940_1949                                      -6.02e-04
## pct_1_bed                                                 4.79e-05
## pct_2_bed                                                 6.98e-03
## pct_3_bed                                                -2.13e-06
## pct_4_bed                                                -1.57e-02
## pct_5plus_bed                                             1.14e-03
## area_typeMetro                                           -4.97e-03
## area_typeMicro                                            7.02e-17
## month_date_yyyymm                                         2.13e-03
## cbsa_titleAbilene, TX                                    -4.88e-02
## cbsa_titleAdrian, MI                                      2.46e-02
## cbsa_titleAkron, OH                                      -8.38e-02
## cbsa_titleAlamogordo, NM                                 -1.48e-01
## cbsa_titleAlbany-Schenectady-Troy, NY                     5.40e-02
## cbsa_titleAlbany, GA                                     -2.45e-01
## cbsa_titleAlbany, OR                                      2.92e-01
## cbsa_titleAlbemarle, NC                                   1.48e-01
## cbsa_titleAlbertville, AL                                 1.18e-01
## cbsa_titleAlbuquerque, NM                                -3.59e-02
## cbsa_titleAlexandria, LA                                 -2.24e-01
## cbsa_titleAllentown-Bethlehem-Easton, PA-NJ              -2.07e-02
## cbsa_titleAltoona, PA                                    -3.92e-01
## cbsa_titleAmarillo, TX                                   -1.31e-01
## cbsa_titleAmes, IA                                        2.82e-01
## cbsa_titleAmherst Town-Northampton, MA                    2.55e-01
## cbsa_titleAnchorage, AK                                  -4.60e-01
## cbsa_titleAnderson Creek, NC                              1.80e-01
## cbsa_titleAnn Arbor, MI                                  -5.21e-02
## cbsa_titleAnniston-Oxford, AL                            -2.36e-01
## cbsa_titleAppleton, WI                                    1.77e-01
## cbsa_titleAsheville, NC                                   1.36e-01
## cbsa_titleAthens-Clarke County, GA                        1.16e-01
## cbsa_titleAthens, TN                                      .       
## cbsa_titleAthens, TX                                     -1.91e-02
## cbsa_titleAtlanta-Sandy Springs-Roswell, GA              -1.61e-01
## cbsa_titleAtlantic City-Hammonton, NJ                     2.51e-01
## cbsa_titleAuburn-Opelika, AL                              1.13e-01
## cbsa_titleAuburn, NY                                     -1.47e-01
## cbsa_titleAugusta-Richmond County, GA-SC                 -3.05e-02
## cbsa_titleAugusta-Waterville, ME                          1.29e-01
## cbsa_titleAustin-Round Rock-San Marcos, TX               -2.41e-01
## cbsa_titleBakersfield-Delano, CA                          3.38e-02
## cbsa_titleBaltimore-Columbia-Towson, MD                  -1.02e-01
## cbsa_titleBangor, ME                                     -1.73e-03
## cbsa_titleBaraboo, WI                                    -2.03e-02
## cbsa_titleBarnstable Town, MA                             3.31e-01
## cbsa_titleBaton Rouge, LA                                 6.39e-02
## cbsa_titleBattle Creek, MI                                1.24e-01
## cbsa_titleBay City, MI                                    6.71e-03
## cbsa_titleBeaumont-Port Arthur, TX                       -1.99e-01
## cbsa_titleBeaver Dam, WI                                  1.06e-01
## cbsa_titleBeckley, WV                                    -1.40e-01
## cbsa_titleBellingham, WA                                  2.19e-01
## cbsa_titleBend, OR                                       -3.79e-02
## cbsa_titleBillings, MT                                    1.38e-01
## cbsa_titleBinghamton, NY                                 -8.40e-02
## cbsa_titleBirmingham, AL                                 -3.49e-02
## cbsa_titleBismarck, ND                                    9.43e-02
## cbsa_titleBlacksburg-Christiansburg-Radford, VA          -6.77e-02
## cbsa_titleBloomington, IL                                -3.60e-01
## cbsa_titleBloomington, IN                                -1.59e-02
## cbsa_titleBloomsburg-Berwick, PA                          7.33e-03
## cbsa_titleBluefield, WV-VA                               -3.37e-01
## cbsa_titleBoise City, ID                                  2.28e-01
## cbsa_titleBoston-Cambridge-Newton, MA-NH                  2.37e-01
## cbsa_titleBoulder, CO                                     2.51e-01
## cbsa_titleBowling Green, KY                              -3.72e-02
## cbsa_titleBozeman, MT                                     4.31e-01
## cbsa_titleBrainerd, MN                                   -7.31e-03
## cbsa_titleBremerton-Silverdale-Port Orchard, WA          -1.66e-01
## cbsa_titleBridgeport-Stamford-Danbury, CT                 1.49e-01
## cbsa_titleBrigham City, UT-ID                             1.76e-01
## cbsa_titleBrownsville-Harlingen, TX                       .       
## cbsa_titleBrunswick-St. Simons, GA                        1.12e-01
## cbsa_titleBuffalo-Cheektowaga, NY                         1.43e-01
## cbsa_titleBurlington-South Burlington, VT                 1.85e-01
## cbsa_titleBurlington, NC                                  .       
## cbsa_titleCanton-Massillon, OH                           -1.04e-01
## cbsa_titleCape Coral-Fort Myers, FL                      -4.13e-01
## cbsa_titleCape Girardeau, MO-IL                          -2.67e-01
## cbsa_titleCarson City, NV                                -1.46e-01
## cbsa_titleCasper, WY                                      1.07e-01
## cbsa_titleCedar City, UT                                  2.65e-01
## cbsa_titleCedar Rapids, IA                               -2.53e-02
## cbsa_titleCentralia, WA                                   1.07e-01
## cbsa_titleChambersburg, PA                               -2.69e-03
## cbsa_titleChampaign-Urbana, IL                           -1.45e-01
## cbsa_titleCharleston-North Charleston, SC                 2.10e-01
## cbsa_titleCharleston, WV                                 -3.95e-01
## cbsa_titleCharlotte-Concord-Gastonia, NC-SC              -6.12e-02
## cbsa_titleCharlottesville, VA                             6.70e-04
## cbsa_titleChattanooga, TN-GA                              1.26e-01
## cbsa_titleCheyenne, WY                                    1.83e-02
## cbsa_titleChicago-Naperville-Elgin, IL-IN                -2.10e-01
## cbsa_titleChico, CA                                       9.00e-02
## cbsa_titleChillicothe, OH                                 .       
## cbsa_titleCincinnati, OH-KY-IN                            2.98e-02
## cbsa_titleClarksburg, WV                                 -1.88e-01
## cbsa_titleClarksville, TN-KY                              8.98e-02
## cbsa_titleClearlake, CA                                   2.56e-01
## cbsa_titleCleveland, OH                                  -5.33e-02
## cbsa_titleCleveland, TN                                  -2.11e-01
## cbsa_titleClovis, NM                                      .       
## cbsa_titleCoeur d'Alene, ID                               2.58e-01
## cbsa_titleCollege Station-Bryan, TX                       3.18e-02
## cbsa_titleColorado Springs, CO                            .       
## cbsa_titleColumbia, MO                                   -7.55e-02
## cbsa_titleColumbia, SC                                    .       
## cbsa_titleColumbus, GA-AL                                -1.12e-01
## cbsa_titleColumbus, IN                                   -1.43e-01
## cbsa_titleColumbus, MS                                   -1.33e-01
## cbsa_titleColumbus, OH                                   -4.51e-02
## cbsa_titleConcord, NH                                    -7.81e-03
## cbsa_titleCookeville, TN                                  1.34e-01
## cbsa_titleCorbin, KY                                      1.57e-01
## cbsa_titleCorning, NY                                    -3.83e-01
## cbsa_titleCorpus Christi, TX                              3.01e-03
## cbsa_titleCorvallis, OR                                   1.41e-01
## cbsa_titleCrestview-Fort Walton Beach-Destin, FL          4.94e-01
## cbsa_titleCrossville, TN                                 -1.72e-02
## cbsa_titleCullman, AL                                    -3.06e-02
## cbsa_titleCumberland, MD-WV                              -2.02e-01
## cbsa_titleDallas-Fort Worth-Arlington, TX                -2.96e-01
## cbsa_titleDalton, GA                                      1.61e-01
## cbsa_titleDanville, IL                                   -6.08e-01
## cbsa_titleDanville, VA                                   -1.04e-01
## cbsa_titleDaphne-Fairhope-Foley, AL                       3.69e-01
## cbsa_titleDavenport-Moline-Rock Island, IA-IL            -1.71e-01
## cbsa_titleDayton-Kettering-Beavercreek, OH               -4.62e-02
## cbsa_titleDecatur, AL                                    -1.26e-01
## cbsa_titleDecatur, IL                                    -4.96e-01
## cbsa_titleDeltona-Daytona Beach-Ormond Beach, FL         -2.55e-02
## cbsa_titleDenver-Aurora-Centennial, CO                   -8.14e-02
## cbsa_titleDes Moines-West Des Moines, IA                  1.45e-01
## cbsa_titleDetroit-Warren-Dearborn, MI                     7.92e-02
## cbsa_titleDothan, AL                                     -9.60e-02
## cbsa_titleDover, DE                                       1.47e-01
## cbsa_titleDuBois, PA                                     -1.42e-01
## cbsa_titleDubuque, IA                                     1.28e-01
## cbsa_titleDuluth, MN-WI                                   8.63e-02
## cbsa_titleDurham-Chapel Hill, NC                         -5.28e-03
## cbsa_titleEagle Pass, TX                                  .       
## cbsa_titleEast Stroudsburg, PA                           -1.70e-01
## cbsa_titleEau Claire, WI                                 -8.61e-02
## cbsa_titleEl Centro, CA                                  -5.78e-02
## cbsa_titleEl Paso, TX                                     5.13e-03
## cbsa_titleElizabethtown, KY                              -4.75e-02
## cbsa_titleElkhart-Goshen, IN                             -1.36e-01
## cbsa_titleElmira, NY                                     -2.68e-01
## cbsa_titleEnid, OK                                       -3.60e-01
## cbsa_titleErie, PA                                        7.44e-02
## cbsa_titleEugene-Springfield, OR                          3.21e-01
## cbsa_titleEureka-Arcata, CA                               3.97e-01
## cbsa_titleEvansville, IN                                 -1.88e-01
## cbsa_titleFairbanks-College, AK                          -4.00e-01
## cbsa_titleFargo, ND-MN                                   -2.20e-02
## cbsa_titleFaribault-Northfield, MN                        1.04e-01
## cbsa_titleFarmington, MO                                 -5.06e-02
## cbsa_titleFarmington, NM                                  3.58e-03
## cbsa_titleFayetteville-Springdale-Rogers, AR             -1.85e-01
## cbsa_titleFayetteville, NC                                1.71e-01
## cbsa_titleFindlay, OH                                    -2.37e-01
## cbsa_titleFlagstaff, AZ                                   2.52e-01
## cbsa_titleFlint, MI                                      -1.26e-02
## cbsa_titleFlorence-Muscle Shoals, AL                     -1.83e-01
## cbsa_titleFlorence, SC                                   -1.36e-02
## cbsa_titleFond du Lac, WI                                 1.42e-01
## cbsa_titleForest City, NC                                 3.98e-02
## cbsa_titleFort Collins-Loveland, CO                      -5.01e-02
## cbsa_titleFort Payne, AL                                  2.23e-01
## cbsa_titleFort Smith, AR-OK                              -2.32e-01
## cbsa_titleFort Wayne, IN                                  8.82e-02
## cbsa_titleFrankfort, KY                                  -3.40e-02
## cbsa_titleFresno, CA                                     -1.65e-02
## cbsa_titleGadsden, AL                                     8.10e-02
## cbsa_titleGainesville, FL                                 2.26e-01
## cbsa_titleGainesville, GA                                -1.41e-01
## cbsa_titleGallup, NM                                     -2.26e-01
## cbsa_titleGettysburg, PA                                 -1.21e-02
## cbsa_titleGlens Falls, NY                                 1.37e-01
## cbsa_titleGoldsboro, NC                                   2.09e-01
## cbsa_titleGranbury, TX                                   -1.36e-02
## cbsa_titleGrand Forks, ND-MN                             -1.41e-01
## cbsa_titleGrand Island, NE                               -4.38e-01
## cbsa_titleGrand Junction, CO                              2.67e-01
## cbsa_titleGrand Rapids-Wyoming-Kentwood, MI               1.36e-01
## cbsa_titleGrants Pass, OR                                 1.96e-01
## cbsa_titleGreat Falls, MT                                 4.70e-01
## cbsa_titleGreeley, CO                                    -1.04e-01
## cbsa_titleGreen Bay, WI                                   8.32e-02
## cbsa_titleGreeneville, TN                                 2.60e-01
## cbsa_titleGreenfield, MA                                  1.50e-01
## cbsa_titleGreensboro-High Point, NC                       3.89e-02
## cbsa_titleGreenville-Anderson-Greer, SC                  -4.42e-02
## cbsa_titleGreenville, NC                                 -4.58e-02
## cbsa_titleGreenwood, SC                                   2.71e-01
## cbsa_titleGulfport-Biloxi, MS                             4.43e-02
## cbsa_titleHagerstown-Martinsburg, MD-WV                   6.49e-02
## cbsa_titleHammond, LA                                     1.93e-01
## cbsa_titleHanford-Corcoran, CA                           -7.40e-02
## cbsa_titleHarrisburg-Carlisle, PA                        -8.84e-02
## cbsa_titleHarrisonburg, VA                                1.89e-01
## cbsa_titleHartford-West Hartford-East Hartford, CT       -9.16e-02
## cbsa_titleHattiesburg, MS                                 2.61e-02
## cbsa_titleHeber, UT                                       1.12e-01
## cbsa_titleHelena, MT                                      7.78e-02
## cbsa_titleHermiston-Pendleton, OR                         2.07e-01
## cbsa_titleHermitage, PA                                   9.61e-02
## cbsa_titleHickory-Lenoir-Morganton, NC                    3.71e-02
## cbsa_titleHilo-Kailua, HI                                 1.88e-02
## cbsa_titleHilton Head Island-Bluffton-Port Royal, SC      1.78e-01
## cbsa_titleHinesville, GA                                  5.97e-02
## cbsa_titleHobbs, NM                                      -4.37e-01
## cbsa_titleHolland, MI                                     1.52e-01
## cbsa_titleHomosassa Springs, FL                          -4.01e-01
## cbsa_titleHot Springs, AR                                -3.91e-02
## cbsa_titleHouma-Bayou Cane-Thibodaux, LA                 -1.27e-01
## cbsa_titleHouston-Pasadena-The Woodlands, TX             -3.57e-01
## cbsa_titleHuntington-Ashland, WV-KY-OH                   -3.14e-01
## cbsa_titleHuntsville, AL                                 -2.35e-02
## cbsa_titleHuntsville, TX                                 -6.76e-02
## cbsa_titleIdaho Falls, ID                                -1.50e-01
## cbsa_titleIndiana, PA                                    -3.44e-01
## cbsa_titleIndianapolis-Carmel-Greenwood, IN              -1.88e-01
## cbsa_titleIowa City, IA                                  -2.64e-02
## cbsa_titleIthaca, NY                                     -9.87e-04
## cbsa_titleJackson, MI                                    -1.84e-02
## cbsa_titleJackson, MS                                     1.01e-01
## cbsa_titleJackson, TN                                     2.59e-01
## cbsa_titleJacksonville, FL                                1.51e-02
## cbsa_titleJacksonville, NC                                1.33e-01
## cbsa_titleJamestown-Dunkirk, NY                          -8.73e-02
## cbsa_titleJanesville-Beloit, WI                           1.20e-01
## cbsa_titleJefferson City, MO                             -1.11e-01
## cbsa_titleJefferson, GA                                  -1.38e-01
## cbsa_titleJohnson City, TN                                2.56e-01
## cbsa_titleJohnstown, PA                                  -3.52e-01
## cbsa_titleJonesboro, AR                                  -4.74e-02
## cbsa_titleJoplin, MO-KS                                  -9.15e-02
## cbsa_titleKahului-Wailuku, HI                             3.36e-01
## cbsa_titleKalamazoo-Portage, MI                          -3.00e-02
## cbsa_titleKalispell, MT                                   6.84e-01
## cbsa_titleKankakee, IL                                   -1.76e-01
## cbsa_titleKansas City, MO-KS                              .       
## cbsa_titleKapaa, HI                                       3.95e-01
## cbsa_titleKeene, NH                                      -2.24e-02
## cbsa_titleKennewick-Richland, WA                          1.47e-01
## cbsa_titleKenosha, WI                                    -2.19e-02
## cbsa_titleKey West-Key Largo, FL                          3.00e-01
## cbsa_titleKilleen-Temple, TX                              1.64e-01
## cbsa_titleKingsport-Bristol, TN-VA                       -1.24e-02
## cbsa_titleKingston, NY                                    1.69e-01
## cbsa_titleKiryas Joel-Poughkeepsie-Newburgh, NY           7.28e-02
## cbsa_titleKlamath Falls, OR                               3.05e-02
## cbsa_titleKnoxville, TN                                   1.88e-01
## cbsa_titleKokomo, IN                                     -1.52e-01
## cbsa_titleLa Crosse-Onalaska, WI-MN                       3.03e-02
## cbsa_titleLaconia, NH                                    -1.87e-01
## cbsa_titleLafayette-West Lafayette, IN                    8.62e-02
## cbsa_titleLafayette, LA                                  -5.38e-03
## cbsa_titleLaGrange, GA-AL                                 6.53e-02
## cbsa_titleLake Charles, LA                               -2.21e-01
## cbsa_titleLake City, FL                                   1.37e-01
## cbsa_titleLake Havasu City-Kingman, AZ                   -6.64e-03
## cbsa_titleLakeland-Winter Haven, FL                       1.25e-02
## cbsa_titleLancaster, PA                                   2.68e-02
## cbsa_titleLansing-East Lansing, MI                       -1.77e-01
## cbsa_titleLaredo, TX                                     -1.44e-01
## cbsa_titleLas Cruces, NM                                 -5.87e-02
## cbsa_titleLas Vegas-Henderson-North Las Vegas, NV        -1.57e-01
## cbsa_titleLaurel, MS                                      1.24e-01
## cbsa_titleLawrence, KS                                    5.97e-02
## cbsa_titleLawton, OK                                     -2.38e-01
## cbsa_titleLebanon-Claremont, NH-VT                        1.84e-02
## cbsa_titleLebanon, PA                                     8.70e-02
## cbsa_titleLewiston-Auburn, ME                             3.08e-01
## cbsa_titleLewiston, ID-WA                                 1.95e-01
## cbsa_titleLexington Park, MD                             -3.20e-01
## cbsa_titleLexington-Fayette, KY                          -3.99e-03
## cbsa_titleLima, OH                                       -9.48e-02
## cbsa_titleLincoln, NE                                    -8.27e-03
## cbsa_titleLittle Rock-North Little Rock-Conway, AR       -7.02e-02
## cbsa_titleLogan, UT-ID                                    1.45e-01
## cbsa_titleLongview-Kelso, WA                              2.67e-01
## cbsa_titleLongview, TX                                   -1.17e-01
## cbsa_titleLos Angeles-Long Beach-Anaheim, CA              4.52e-01
## cbsa_titleLouisville/Jefferson County, KY-IN             -2.57e-02
## cbsa_titleLubbock, TX                                    -2.51e-01
## cbsa_titleLufkin, TX                                     -5.66e-02
## cbsa_titleLumberton, NC                                  -4.78e-02
## cbsa_titleLynchburg, VA                                   1.63e-01
## cbsa_titleMacon-Bibb County, GA                          -2.38e-01
## cbsa_titleMadison, WI                                    -6.91e-02
## cbsa_titleManchester-Nashua, NH                          -1.27e-01
## cbsa_titleManhattan, KS                                  -2.73e-01
## cbsa_titleManitowoc, WI                                   4.93e-03
## cbsa_titleMankato, MN                                    -1.56e-01
## cbsa_titleMansfield, OH                                   3.42e-02
## cbsa_titleMarinette, WI-MI                               -1.24e-01
## cbsa_titleMarion-Herrin, IL                              -5.15e-01
## cbsa_titleMarion, IN                                      1.21e-01
## cbsa_titleMarquette, MI                                   5.29e-02
## cbsa_titleMassena-Ogdensburg, NY                         -2.66e-01
## cbsa_titleMcAllen-Edinburg-Mission, TX                   -2.29e-01
## cbsa_titleMeadville, PA                                  -1.45e-01
## cbsa_titleMedford, OR                                     1.75e-01
## cbsa_titleMemphis, TN-MS-AR                               8.86e-02
## cbsa_titleMerced, CA                                      1.97e-01
## cbsa_titleMeridian, MS                                   -4.72e-01
## cbsa_titleMiami-Fort Lauderdale-West Palm Beach, FL       1.01e-01
## cbsa_titleMichigan City-La Porte, IN                      5.84e-02
## cbsa_titleMidland, MI                                    -5.57e-03
## cbsa_titleMidland, TX                                    -1.28e-01
## cbsa_titleMilwaukee-Waukesha, WI                          6.60e-02
## cbsa_titleMinneapolis-St. Paul-Bloomington, MN-WI        -1.47e-01
## cbsa_titleMinot, ND                                      -1.94e-01
## cbsa_titleMissoula, MT                                    4.44e-01
## cbsa_titleMobile, AL                                      9.44e-02
## cbsa_titleModesto, CA                                     3.29e-02
## cbsa_titleMonroe, LA                                     -5.13e-02
## cbsa_titleMonroe, MI                                      2.43e-01
## cbsa_titleMontgomery, AL                                  6.07e-03
## cbsa_titleMonticello, NY                                  1.12e-01
## cbsa_titleMorehead City, NC                               8.37e-03
## cbsa_titleMorgantown, WV                                 -1.31e-01
## cbsa_titleMorristown, TN                                  3.70e-01
## cbsa_titleMoses Lake, WA                                  2.29e-01
## cbsa_titleMount Airy, NC                                  1.19e-02
## cbsa_titleMount Pleasant, MI                              1.29e-02
## cbsa_titleMount Vernon-Anacortes, WA                      6.82e-02
## cbsa_titleMuncie, IN                                     -2.41e-01
## cbsa_titleMurrells Inlet, SC                              2.12e-01
## cbsa_titleMuskegon-Norton Shores, MI                      3.71e-01
## cbsa_titleMuskogee, OK                                   -2.58e-01
## cbsa_titleMyrtle Beach-Conway-North Myrtle Beach, SC     -3.69e-02
## cbsa_titleNacogdoches, TX                                -1.28e-01
## cbsa_titleNapa, CA                                        3.39e-01
## cbsa_titleNaples-Marco Island, FL                        -2.79e-01
## cbsa_titleNashville-Davidson--Murfreesboro--Franklin, TN  6.67e-02
## cbsa_titleNew Bern, NC                                    1.81e-01
## cbsa_titleNew Haven, CT                                  -5.60e-03
## cbsa_titleNew Iberia, LA                                 -2.49e-01
## cbsa_titleNew Orleans-Metairie, LA                        1.51e-01
## cbsa_titleNew Philadelphia-Dover, OH                     -1.38e-01
## cbsa_titleNew York-Newark-Jersey City, NY-NJ              4.38e-01
## cbsa_titleNiles, MI                                       1.40e-01
## cbsa_titleNorth Port-Bradenton-Sarasota, FL              -3.53e-01
## cbsa_titleNorth Wilkesboro, NC                            2.69e-01
## cbsa_titleNorwich-New London-Willimantic, CT              8.62e-02
## cbsa_titleOak Harbor, WA                                  4.71e-02
## cbsa_titleOcala, FL                                      -2.11e-01
## cbsa_titleOdessa, TX                                     -1.31e-01
## cbsa_titleOgden, UT                                       6.06e-03
## cbsa_titleOklahoma City, OK                              -5.08e-02
## cbsa_titleOlean, NY                                      -3.48e-02
## cbsa_titleOlympia-Lacey-Tumwater, WA                     -1.02e-01
## cbsa_titleOmaha, NE-IA                                   -9.24e-02
## cbsa_titleOpelousas, LA                                  -1.96e-01
## cbsa_titleOrangeburg, SC                                 -2.17e-02
## cbsa_titleOrlando-Kissimmee-Sanford, FL                  -5.05e-02
## cbsa_titleOshkosh-Neenah, WI                              3.96e-02
## cbsa_titleOttawa, IL                                     -2.67e-01
## cbsa_titleOwensboro, KY                                  -1.26e-02
## cbsa_titleOwosso, MI                                     -1.05e-01
## cbsa_titleOxford, MS                                     -3.87e-02
## cbsa_titleOxnard-Thousand Oaks-Ventura, CA                2.67e-01
## cbsa_titlePaducah, KY-IL                                 -3.52e-01
## cbsa_titlePalatka, FL                                     2.70e-01
## cbsa_titlePalm Bay-Melbourne-Titusville, FL               3.99e-02
## cbsa_titlePanama City-Panama City Beach, FL               1.29e-01
## cbsa_titleParkersburg-Vienna, WV                         -2.98e-01
## cbsa_titlePensacola-Ferry Pass-Brent, FL                  1.05e-01
## cbsa_titlePeoria, IL                                     -4.77e-01
## cbsa_titlePhiladelphia-Camden-Wilmington, PA-NJ-DE-MD     9.31e-02
## cbsa_titlePhoenix-Mesa-Chandler, AZ                      -8.01e-03
## cbsa_titlePikeville, KY                                  -2.93e-01
## cbsa_titlePine Bluff, AR                                 -4.93e-01
## cbsa_titlePinehurst-Southern Pines, NC                   -8.55e-03
## cbsa_titlePittsburgh, PA                                 -5.66e-02
## cbsa_titlePittsfield, MA                                  3.75e-01
## cbsa_titlePlattsburgh, NY                                -1.06e-01
## cbsa_titlePocatello, ID                                   6.48e-02
## cbsa_titlePort Angeles, WA                                6.84e-02
## cbsa_titlePort St. Lucie, FL                             -1.79e-01
## cbsa_titlePortland-South Portland, ME                     2.63e-01
## cbsa_titlePortland-Vancouver-Hillsboro, OR-WA            -7.16e-02
## cbsa_titlePortsmouth, OH                                  .       
## cbsa_titlePottsville, PA                                 -4.17e-01
## cbsa_titlePrescott Valley-Prescott, AZ                    5.45e-02
## cbsa_titleProvidence-Warwick, RI-MA                       2.55e-01
## cbsa_titleProvo-Orem-Lehi, UT                            -1.52e-01
## cbsa_titlePueblo, CO                                      9.11e-02
## cbsa_titlePunta Gorda, FL                                -5.76e-01
## cbsa_titlePutnam, CT                                      1.28e-01
## cbsa_titleQuincy, IL-MO                                  -3.06e-01
## cbsa_titleRacine-Mount Pleasant, WI                       1.04e-01
## cbsa_titleRaleigh-Cary, NC                               -2.69e-01
## cbsa_titleRapid City, SD                                  2.05e-01
## cbsa_titleReading, PA                                    -8.73e-02
## cbsa_titleRedding, CA                                     1.18e-01
## cbsa_titleReno, NV                                        1.62e-03
## cbsa_titleRexburg, ID                                     6.36e-02
## cbsa_titleRichmond-Berea, KY                              1.29e-01
## cbsa_titleRichmond, IN                                    4.07e-03
## cbsa_titleRichmond, VA                                    1.18e-01
## cbsa_titleRifle, CO                                       1.06e+00
## cbsa_titleRio Grande City-Roma, TX                        .       
## cbsa_titleRiverside-San Bernardino-Ontario, CA            6.53e-02
## cbsa_titleRoanoke Rapids, NC                             -9.17e-02
## cbsa_titleRoanoke, VA                                    -3.45e-02
## cbsa_titleRochester, MN                                  -1.46e-02
## cbsa_titleRochester, NY                                  -6.85e-02
## cbsa_titleRockford, IL                                   -3.03e-01
## cbsa_titleRocky Mount, NC                                 1.39e-02
## cbsa_titleRome, GA                                        9.01e-02
## cbsa_titleRoseburg, OR                                    3.18e-01
## cbsa_titleRussellville, AR                               -1.46e-01
## cbsa_titleSacramento-Roseville-Folsom, CA                -4.09e-02
## cbsa_titleSaginaw, MI                                    -2.89e-02
## cbsa_titleSalem, OH                                      -7.60e-02
## cbsa_titleSalem, OR                                      -1.24e-02
## cbsa_titleSalinas, CA                                     6.88e-01
## cbsa_titleSalisbury, MD                                   9.49e-02
## cbsa_titleSalt Lake City-Murray, UT                      -4.10e-02
## cbsa_titleSan Angelo, TX                                 -3.64e-01
## cbsa_titleSan Antonio-New Braunfels, TX                  -2.33e-01
## cbsa_titleSan Diego-Chula Vista-Carlsbad, CA              1.83e-01
## cbsa_titleSan Francisco-Oakland-Fremont, CA               1.90e-02
## cbsa_titleSan Jose-Sunnyvale-Santa Clara, CA             -1.28e-01
## cbsa_titleSan Luis Obispo-Paso Robles, CA                 3.25e-01
## cbsa_titleSandusky, OH                                    6.07e-02
## cbsa_titleSanford, NC                                     1.33e-01
## cbsa_titleSanta Cruz-Watsonville, CA                      4.97e-01
## cbsa_titleSanta Fe, NM                                    1.18e-01
## cbsa_titleSanta Maria-Santa Barbara, CA                   9.48e-01
## cbsa_titleSanta Rosa-Petaluma, CA                         1.88e-01
## cbsa_titleSavannah, GA                                    9.45e-02
## cbsa_titleScranton--Wilkes-Barre, PA                     -1.55e-01
## cbsa_titleSeaford, DE                                    -2.29e-01
## cbsa_titleSearcy, AR                                      3.22e-02
## cbsa_titleSeattle-Tacoma-Bellevue, WA                     6.86e-03
## cbsa_titleSebastian-Vero Beach-West Vero Corridor, FL    -2.37e-01
## cbsa_titleSebring, FL                                    -2.97e-01
## cbsa_titleSeneca, SC                                     -3.69e-02
## cbsa_titleSevierville, TN                                 3.65e-01
## cbsa_titleShawnee, OK                                    -8.16e-02
## cbsa_titleSheboygan, WI                                   4.78e-02
## cbsa_titleShelby-Kings Mountain, NC                       6.96e-02
## cbsa_titleShelton, WA                                    -1.24e-01
## cbsa_titleSherman-Denison, TX                             2.29e-02
## cbsa_titleShow Low, AZ                                    3.70e-01
## cbsa_titleShreveport-Bossier City, LA                     2.64e-02
## cbsa_titleSierra Vista-Douglas, AZ                        1.11e-02
## cbsa_titleSioux City, IA-NE-SD                           -1.67e-01
## cbsa_titleSioux Falls, SD-MN                              7.73e-02
## cbsa_titleSlidell-Mandeville-Covington, LA                .       
## cbsa_titleSomerset, KY                                    2.38e-01
## cbsa_titleSomerset, PA                                   -1.23e-01
## cbsa_titleSouth Bend-Mishawaka, IN-MI                     7.37e-03
## cbsa_titleSpartanburg, SC                                -1.05e-02
## cbsa_titleSpokane-Spokane Valley, WA                      7.45e-02
## cbsa_titleSpringfield, IL                                -6.23e-01
## cbsa_titleSpringfield, MA                                 2.07e-01
## cbsa_titleSpringfield, MO                                -1.71e-02
## cbsa_titleSpringfield, OH                                -2.13e-02
## cbsa_titleSt. Cloud, MN                                   8.38e-03
## cbsa_titleSt. George, UT                                  1.40e-01
## cbsa_titleSt. Joseph, MO-KS                              -3.08e-01
## cbsa_titleSt. Louis, MO-IL                               -1.35e-01
## cbsa_titleState College, PA                              -5.76e-02
## cbsa_titleStatesboro, GA                                  5.32e-01
## cbsa_titleStaunton-Stuarts Draft, VA                      2.80e-01
## cbsa_titleStevens Point-Plover, WI                        4.48e-02
## cbsa_titleStillwater, OK                                 -1.06e-02
## cbsa_titleStockton-Lodi, CA                              -7.65e-02
## cbsa_titleSumter, SC                                      2.06e-01
## cbsa_titleSunbury, PA                                    -1.50e-01
## cbsa_titleSyracuse, NY                                    2.19e-03
## cbsa_titleTalladega-Sylacauga, AL                         7.93e-02
## cbsa_titleTallahassee, FL                                 1.11e-01
## cbsa_titleTampa-St. Petersburg-Clearwater, FL            -1.05e-01
## cbsa_titleTerre Haute, IN                                -2.59e-01
## cbsa_titleTexarkana, TX-AR                               -1.95e-01
## cbsa_titleThomasville, GA                                 2.08e-01
## cbsa_titleToledo, OH                                     -4.59e-02
## cbsa_titleTopeka, KS                                     -2.45e-01
## cbsa_titleTorrington, CT                                  1.68e-01
## cbsa_titleTraverse City, MI                               2.05e-01
## cbsa_titleTrenton-Princeton, NJ                          -2.55e-01
## cbsa_titleTruckee-Grass Valley, CA                       -2.75e-01
## cbsa_titleTucson, AZ                                      2.32e-04
## cbsa_titleTullahoma-Manchester, TN                        2.43e-01
## cbsa_titleTulsa, OK                                      -1.52e-01
## cbsa_titleTupelo, MS                                      1.53e-01
## cbsa_titleTuscaloosa, AL                                  1.45e-01
## cbsa_titleTwin Falls, ID                                  2.42e-01
## cbsa_titleTyler, TX                                      -1.69e-01
## cbsa_titleUkiah, CA                                       1.74e-01
## cbsa_titleUrban Honolulu, HI                             -5.94e-02
## cbsa_titleUtica-Rome, NY                                  7.50e-02
## cbsa_titleValdosta, GA                                    2.46e-01
## cbsa_titleVallejo, CA                                    -4.15e-02
## cbsa_titleVictoria, TX                                   -4.31e-02
## cbsa_titleVineland, NJ                                    5.56e-02
## cbsa_titleVirginia Beach-Chesapeake-Norfolk, VA-NC        1.58e-01
## cbsa_titleVisalia, CA                                    -3.53e-02
## cbsa_titleWaco, TX                                        8.96e-02
## cbsa_titleWalla Walla, WA                                -3.37e-02
## cbsa_titleWarner Robins, GA                              -1.58e-01
## cbsa_titleWarsaw, IN                                     -3.75e-03
## cbsa_titleWashington-Arlington-Alexandria, DC-VA-MD-WV   -1.73e-01
## cbsa_titleWaterbury-Shelton, CT                           1.83e-01
## cbsa_titleWaterloo-Cedar Falls, IA                        5.21e-02
## cbsa_titleWatertown-Fort Atkinson, WI                    -6.87e-02
## cbsa_titleWatertown-Fort Drum, NY                        -2.94e-01
## cbsa_titleWausau, WI                                     -1.76e-01
## cbsa_titleWeirton-Steubenville, WV-OH                    -2.44e-01
## cbsa_titleWenatchee-East Wenatchee, WA                    9.28e-02
## cbsa_titleWheeling, WV-OH                                -1.18e-01
## cbsa_titleWhitewater-Elkhorn, WI                         -2.92e-02
## cbsa_titleWichita Falls, TX                              -3.15e-01
## cbsa_titleWichita, KS                                    -1.43e-01
## cbsa_titleWildwood-The Villages, FL                      -4.37e-01
## cbsa_titleWilliamsport, PA                               -3.41e-03
## cbsa_titleWilmington, NC                                  7.66e-03
## cbsa_titleWilson, NC                                      5.41e-03
## cbsa_titleWinchester, VA-WV                              -5.27e-04
## cbsa_titleWinston-Salem, NC                               2.36e-02
## cbsa_titleWisconsin Rapids-Marshfield, WI                -1.17e-01
## cbsa_titleWooster, OH                                     1.02e-01
## cbsa_titleWorcester, MA                                   6.97e-02
## cbsa_titleYakima, WA                                      1.43e-01
## cbsa_titleYork-Hanover, PA                               -9.96e-03
## cbsa_titleYoungstown-Warren, OH                           1.32e-03
## cbsa_titleYuba City, CA                                  -1.88e-02
## cbsa_titleYuma, AZ                                        1.57e-02
## cbsa_titleZanesville, OH                                 -8.62e-02
## total_listing_count                                      -4.60e-07
## pending_ratio                                             2.17e-02
## median_days_on_market                                    -9.51e-04
Log Price No CBSA
cv_lasso_log_price_no_cbsa = cv.glmnet(X_train_no_cbsa_matrix, log_price_train, alpha = 1)

lasso_log_price_no_cbsa_best_lambda = cv_lasso_log_price$lambda.min
lasso_log_price_no_cbsa_best_lambda
## [1] 0.0000929
plot(cv_lasso_log_price_no_cbsa)

lasso_log_price_no_cbsa_best_model = glmnet(X_train_no_cbsa_matrix, log_price_train, alpha = 1, lambda=lasso_log_price_no_cbsa_best_lambda)
coef(lasso_log_price_no_cbsa_best_model)
## 53 x 1 sparse Matrix of class "dgCMatrix"
##                                s0
## (Intercept)             -1.62e+02
## total_population         1.67e-08
## pct_female               2.23e-02
## pct_white               -1.75e-02
## pct_black               -2.13e-02
## pct_aian                -1.43e-02
## pct_asian               -1.82e-02
## pct_nhpi                 2.14e-02
## pct_two_or_more         -1.46e-02
## pct_hispanic            -1.31e-03
## median_age               8.33e-03
## pct_under_18            -2.29e-02
## total_households         .       
## pct_married_couple      -1.33e-02
## pct_male_householder    -2.82e-02
## pct_female_householder  -1.46e-02
## pct_unemployed          -4.11e-03
## pct_below_poverty       -7.15e-03
## pct_income_10_14999      1.37e-02
## pct_income_15_24999      1.55e-02
## pct_income_25_34999      1.42e-02
## pct_income_35_49999      1.83e-02
## pct_income_50_74999      1.68e-02
## pct_income_75_99999      1.94e-02
## pct_income_100_149k      3.41e-02
## pct_income_150_199k      3.99e-02
## pct_income_200k_plus     5.01e-02
## median_household_income  8.58e-08
## pct_high_school_plus    -3.78e-04
## pct_bachelors_plus      -1.19e-03
## total_housing_units      3.34e-08
## pct_vacant_units         1.10e-02
## pct_owner_occupied      -1.36e-02
## pct_built_2020_plus      1.70e-02
## pct_built_2010_2019      2.01e-03
## pct_built_2000_2009      8.14e-03
## pct_built_1990_1999      1.94e-02
## pct_built_1980_1989      5.71e-03
## pct_built_1970_1979      1.65e-02
## pct_built_1960_1969      5.18e-03
## pct_built_1950_1959      5.10e-03
## pct_built_1940_1949      6.84e-03
## pct_1_bed               -1.05e-02
## pct_2_bed               -1.40e-02
## pct_3_bed               -1.66e-02
## pct_4_bed               -2.65e-02
## pct_5plus_bed           -1.26e-02
## area_typeMetro          -6.41e-02
## area_typeMicro           4.82e-13
## month_date_yyyymm        8.33e-04
## total_listing_count     -1.18e-05
## pending_ratio           -5.43e-02
## median_days_on_market    2.19e-04

Regression Tree

## ---- RT_simple ----
options(scipen = 999)   # turn off scientific notation

# # Keep only interpretable economic drivers
# inventory_model_simple <- full_dataset_clean %>%
#   select(
#     median_listing_price_per_square_foot, 
#     total_listing_count,                   
#     pending_ratio,                        
#     median_days_on_market                 
#   )
# 
# # Fit a simple, readable regression tree
# reg_tree_simple <- rpart(
#   median_listing_price_per_square_foot ~
#     total_listing_count + pending_ratio + median_days_on_market,
#   data = inventory_model_simple,
#   method = "anova",
#   control = rpart.control(maxdepth = 3, minbucket = 200)  
# )
# 
# # Plot the tree
# rpart.plot(
#   reg_tree_simple,
#   type = 2,
#   extra = 101,
#   under = TRUE,
#   fallen.leaves = TRUE,
#   roundint = TRUE,
#   main = "Regression Tree: Price per Sq.Ft"
# )

# Basic check: supply distribution
summary(full_dataset_clean$total_listing_count)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      40     335     626    2090    1633   52936
#stephs expanded regression tree (median)
tree_data_med <- train %>%
  select(-log_median_price_sqft,
         -cbsa_title) %>%
  na.omit()

set.seed(42)
reg_tree_exp_med <- rpart(median_listing_price_per_square_foot ~ .,
                      data = tree_data_med,
                      method = "anova",
                      control = rpart.control(minbucket = 200))

best_cp_med <- reg_tree_exp_med$cptable[which.min(reg_tree_exp_med$cptable[,"xerror"]), "CP"]
best_cp_med
## [1] 0.01
pruned_tree_data_med <- prune(reg_tree_exp_med, 
                              cp = best_cp_med)

options(repr.plot.width = 28, 
        repr.plot.height = 14)

fancyRpartPlot(reg_tree_exp_med,
               main = "Expanded Regression Tree: Median Price per Sq.Ft",
               sub = paste0("n=", nrow(tree_data_med), "  |  maxdepth=", reg_tree_exp_med$control$maxdepth),
               cex = 1.4,
               tweak = 0.5)   

Random Forest Regression

set.seed(42)
rf_data_med <- train %>%
  mutate(across(where(is.character), as.factor)) %>%
  select(-log_median_price_sqft,-cbsa_title ) %>% #log(price) is basically price. ; cbsa can't be used anyway
  na.omit()                   # Can not handle categorical predictors with more than 53 categories.

rf_exp_med <- randomForest(
  median_listing_price_per_square_foot ~ .,
  data = rf_data_med,
  ntree = 500,
  mtry = floor(sqrt(ncol(rf_data_med) - 1)),
  nodesize = 15,
  importance = TRUE,
  na.action = na.omit
)
imp <- randomForest::importance(rf_exp_med)
imp_df <- data.frame(
  Feature = rownames(imp),
  Importance = imp[, "%IncMSE"]
)

imp_df %>%
  top_n(25, Importance) %>% #top 25 indicators
  ggplot(aes(x=reorder(Feature, Importance), y=Importance)) +
  geom_col(fill="steelblue") +
  coord_flip() +
  labs(title="Top 25 Key Indicators",
       x="Feature",
       y="MSE-contribution (%)") +
  theme_minimal(base_size = 13)

#### Random Forest with Log

set.seed(42)
rf_data_log <- train %>%
  select(-median_listing_price_per_square_foot,-cbsa_title ) #log(price) is basically price. ; cbsa can't be used anyway                   # Can not handle categorical predictors with more than 53 categories.

rf_exp_log <- randomForest(
  log_median_price_sqft ~ .,
  data = rf_data_log,
  ntree = 500,
  mtry = floor(sqrt(ncol(rf_data_log) - 1)),
  nodesize = 15,
  importance = TRUE,
  na.action = na.omit
)
imp_log <- randomForest::importance(rf_exp_log)
imp_log_df <- data.frame(
  Feature = rownames(imp_log),
  Importance = imp[, "%IncMSE"]
)

imp_log_df %>%
  top_n(25, Importance) %>% #top 25 indicators
  ggplot(aes(x=reorder(Feature, Importance), y=Importance)) +
  geom_col(fill="steelblue") +
  coord_flip() +
  labs(title="Top 25 Key Indicators",
       x="Feature",
       y="MSE-contribution (%)") +
  theme_minimal(base_size = 13)

Model Evaluation

```

#data manipulation EDA- summary statistics(baylee) Modeling- linear regressions (alex) Modeling- lasso regression (alex) – probably lots of options here Modeling- regression tree (steph) Modeling- regression forest (ilgaz)